<a href="https://colab.research.google.com/github/cakennedy/266-mbti-project/blob/main/T5Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# W266 Final Project
# Resampling
# October 27, 2022
# John Clark, Shrinivas Joshi, Courtney Kennedy

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cakennedy/266-mbti-project/blob/main/notebooks/T5Model.ipynb#)


# Create a T5 Model and run it on our data

In [2]:
# imports
import sys
import csv


import pandas as pd
import io
from io import BytesIO
import matplotlib.pyplot as plt
import altair as alt
import numpy as np
import textwrap

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.corpus import stopwords
import re
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.utils import resample
from sklearn.utils import shuffle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
# Configuration Flags

# Selects the storage backend for this notebook:
#   True  -> upload_gcloud_credentials() prompts for a service-account key
#            and mount_google_drive() does nothing.
#   False -> mount_google_drive() mounts Google Drive and the credential
#            upload is skipped.
useGCloudForStorage = True


In [4]:
# Global State Flags
# Each flag records that a one-time setup step has already run so that the
# corresponding helper can skip it when a cell is re-executed.
# Written by upload_gcloud_credentials().
uploadedGCloudCredentials = False
# Written by mount_google_drive().
mountedGoogleDrive = False
# Written by config_GCloud().
configuredGCloud = False


# Global Variables
# NOTE(review): gdrive_path is not referenced in the visible part of the
# notebook, and the repeated 'content/drive' segment looks unusual -- confirm.
gdrive_path = '/content/drive/MyDrive/content/drive/'

# Bucket handle and bucket name; both are populated by config_GCloud().
gcloud_bucket = None
gcloud_bucket_name = ""

In [5]:
# To work from a google drive use this:
from google.colab import drive

def mount_google_drive():
    """Mount Google Drive at /content/drive when Drive storage is selected.

    Does nothing when ``useGCloudForStorage`` is truthy, or when the drive was
    already mounted by an earlier call (tracked in ``mountedGoogleDrive``).
    """
    # Only the flag that is assigned below needs a ``global`` declaration;
    # ``useGCloudForStorage`` is merely read. (The original declared the
    # misspelled, non-existent name ``userGCloudForStorage``.)
    global mountedGoogleDrive

    if not useGCloudForStorage and not mountedGoogleDrive:
        drive.mount('/content/drive')
        mountedGoogleDrive = True


In [6]:
# To work from gcloud, use this:

# Upload Google Cloud service account key to enable authentication ( json file )
# Go to https://console.cloud.google.com/:
# Under the Navigation Menu ( upper left 3 horizontal lines) 
# 1. choose IAM & Admin>
# 2. choose Service Accounts>
# 3. Select a Service Account>
# 4. Under the Actions menu ( 3 dots to the right of the service account )>Manage Keys to create your own json credentials file

from google.colab import files
from google.cloud import storage


def upload_gcloud_credentials():
    """Prompt for the Google Cloud service-account key file (JSON) once.

    Does nothing when ``useGCloudForStorage`` is falsy or when a previous call
    already uploaded a file (tracked in ``uploadedGCloudCredentials``).
    """
    # Only the flag assigned below needs ``global``; the config flag is read.
    global uploadedGCloudCredentials

    if not useGCloudForStorage or uploadedGCloudCredentials:
        return

    # files.upload() returns a {filename: contents} mapping of what was
    # uploaded; it is empty when the dialog is cancelled.
    uploaded = files.upload()

    # Mark the step as done only if something was actually uploaded, so that
    # re-running the cell after a cancelled dialog prompts again.
    uploadedGCloudCredentials = bool(uploaded)


def config_GCloud():
    """Connect to the project's Google Cloud Storage bucket (once) and return it.

    On the first call this builds a storage client from the service-account
    JSON key file, prints the available buckets, fetches the working bucket,
    prints the names of the blobs it contains, and raises the ``csv`` module's
    field size limit. The bucket and its name are stored in module globals,
    and later calls return those stored values without reconnecting.

    Returns:
        tuple: ``(gcloud_bucket, gcloud_bucket_name)``.
    """
    global configuredGCloud
    global gcloud_bucket
    global gcloud_bucket_name

    if not configuredGCloud:
        # Authenticate with the service-account key file.
        client = storage.Client.from_service_account_json('pacific-castle-360400-a3ca89f64de6.json')

        # Show every bucket the client can list.
        for available_bucket in client.list_buckets():
            print(available_bucket)

        # Name of the bucket this notebook works with, then the bucket itself.
        gcloud_bucket_name = '266csffile'
        gcloud_bucket = client.get_bucket(gcloud_bucket_name)

        # Show the name of every blob in the bucket.
        for blob in list(gcloud_bucket.list_blobs(prefix='')):
            print(blob.name)

        # Raise the csv field size limit as far as it will go: start at
        # sys.maxsize and shrink by a factor of 10 for as long as
        # csv.field_size_limit() raises OverflowError.
        limit = sys.maxsize
        while True:
            try:
                csv.field_size_limit(limit)
            except OverflowError:
                limit = int(limit/10)
            else:
                break

        configuredGCloud = True

    return gcloud_bucket, gcloud_bucket_name


In [7]:
# write parquet files

def write_parquet_google_cloud( df, filename):
    """Serialize ``df`` to parquet and upload it to the configured bucket.

    Args:
        df: object exposing ``to_parquet()``; it is called with no arguments
            and its result is uploaded as the blob contents.
        filename: name of the destination blob inside the bucket.
    """
    target_bucket, _ = config_GCloud()

    destination = target_bucket.blob( filename )
    payload = df.to_parquet()
    destination.upload_from_string(payload, 'application/octet-stream')



In [8]:
def read_parquet_from_gcloud( filename ):
    """Download a parquet blob from the configured bucket into a DataFrame.

    Args:
        filename: name of the blob inside the bucket returned by
            ``config_GCloud()``.

    Returns:
        The result of ``pd.read_parquet`` on the downloaded bytes.
    """
    bucket, _ = config_GCloud()

    # download_as_bytes() is the current API; download_as_string() is a
    # deprecated alias for it in google-cloud-storage.
    parquet_bytes = bucket.blob( filename ).download_as_bytes()

    # Wrap the raw bytes in a file-like object for pandas to read from.
    return pd.read_parquet(io.BytesIO(parquet_bytes))


In [9]:
# One-time storage setup: prompt for the service-account key file (only when
# GCloud storage is enabled), then connect to the bucket. config_GCloud()
# prints the available buckets and the blob names in the working bucket.
upload_gcloud_credentials()
bucket, bucket_name = config_GCloud()



Saving pacific-castle-360400-a3ca89f64de6.json to pacific-castle-360400-a3ca89f64de6.json
<Bucket: 266csffile>
dev_is_I_data.parquet
dev_is_I_labels.parquet
dev_is_J_data.parquet
dev_is_J_labels.parquet
dev_is_S_data.parquet
dev_is_S_labels.parquet
dev_is_T_data.parquet
dev_is_T_labels.parquet
dev_mbti_data.parquet
dev_mbti_labels.parquet
old_files/
old_files/dev_mbti_data.csv
old_files/dev_mbti_data.parquet
old_files/dev_mbti_labels.csv
old_files/test_mbti_data.csv
old_files/test_mbti_data.parquet
old_files/test_mbti_labels.csv
old_files/train_gen_pop_mbti_data.csv
old_files/train_gen_pop_mbti_data.parquet
old_files/train_gen_pop_mbti_labels.csv
old_files/train_mbti_data.csv
old_files/train_mbti_data.parquet
old_files/train_mbti_labels.csv
old_files/train_over_sampled_mbti_data.csv
old_files/train_over_sampled_mbti_data.parquet
old_files/train_over_sampled_mbti_labels.csv
old_files/train_under_sampled_mbti_data.csv
old_files/train_under_sampled_mbti_data.parquet
old_files/train_under_

In [11]:
# Start with MBTI Training Data
# Each training set is stored in the bucket as a pair of parquet files: one
# with the data and one with the labels.

# Base training set.
mbti_train_data_df = read_parquet_from_gcloud('train_mbti_data.parquet')
mbti_train_labels_df  = read_parquet_from_gcloud('train_mbti_labels.parquet')

# "uniform" training set -- presumably resampled to a uniform class
# distribution; TODO confirm against the resampling notebook.
mbti_train_uniform_data_df  = read_parquet_from_gcloud('train_uniform_mbti_data.parquet')
mbti_train_uniform_labels_df  = read_parquet_from_gcloud('train_uniform_mbti_labels.parquet')


# "gen_pop" training set -- presumably resampled to match general-population
# type frequencies; TODO confirm against the resampling notebook.
mbti_train_genpop_data_df  = read_parquet_from_gcloud('train_gen_pop_mbti_data.parquet')
mbti_train_genpop_labels_df  = read_parquet_from_gcloud('train_gen_pop_mbti_labels.parquet')

In [12]:
# Also get MBTI Dev and Test Data
# Same layout as the training sets: one data file and one labels file per split.

# Dev split.
mbti_dev_data_df  = read_parquet_from_gcloud('dev_mbti_data.parquet')
mbti_dev_labels_df  = read_parquet_from_gcloud('dev_mbti_labels.parquet')

# Test split.
mbti_test_data_df  = read_parquet_from_gcloud('test_mbti_data.parquet')
mbti_test_labels_df  = read_parquet_from_gcloud('test_mbti_labels.parquet')

In [13]:
# Sanity check the files
# Collect every loaded frame under a descriptive key so they can be inspected
# in one loop. Each '<split>_labels' key must hold the labels frame of the
# same split as its '<split>_data' key; the uniform and genpop label frames
# were previously stored under each other's keys.

df_dict = {}

df_dict['train_data'] = mbti_train_data_df
df_dict['train_labels'] = mbti_train_labels_df

df_dict['train_uniform_data'] = mbti_train_uniform_data_df
df_dict['train_uniform_labels'] = mbti_train_uniform_labels_df

df_dict['train_genpop_data'] = mbti_train_genpop_data_df
df_dict['train_genpop_labels'] = mbti_train_genpop_labels_df

df_dict['dev_data'] = mbti_dev_data_df
df_dict['dev_labels'] = mbti_dev_labels_df

df_dict['test_data'] = mbti_test_data_df
df_dict['test_labels'] = mbti_test_labels_df



In [14]:
# Print the shape, column index and first rows of every collected frame.
for df in df_dict.values():
    for labelled in (
        ( "Shape:", df.shape),
        ( "Columns:", df.columns),
        ( "Head:", df.head() ),
    ):
        print(*labelled)

Shape: (1200000, 16)
Columns: Index(['original index', 'Username', 'Age', 'Posts', 'Enneagram',
       'Instinctual Variant', 'Gender', 'Occupation', 'is_I', 'is_S', 'is_T',
       'is_J', 'post_id', 'thread_id', 'post_date', 'message'],
      dtype='object')
Head:    original index      Username     Age  Posts Enneagram Instinctual Variant  \
0          275024      Coriolis    45.0  26905       5w6               sp_sx   
1          913620    metaphours    42.0   1194       4w5                None   
2          149964     Biaxident  2022.0   3617      None                None   
3         1286867  Siúil a Rúin    51.0  13644       496               sx_sp   
4          203249      Carebear    43.0   1449      None                None   

          Gender                                     Occupation  is_I   is_S  \
0           male                                           None  True  False   
1           None                                         living  True  False   
2           N