In [1]:
# Core imports
import numpy as np
import pandas as pd
from collections import Counter

# Pre-processing imports
import nltk


# nltk.word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load
# it from the 'punkt_tab' resource instead of the pickled 'punkt' package, so
# fetch both to keep the notebook runnable on either generation of NLTK.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)


# Model-building imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# BOW
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Word embedding imports
import gensim
from gensim.models import Word2Vec

# Visualize
import seaborn as sns
from tqdm.notebook import tqdm_notebook

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\atabekis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Reading the dataset and splitting into train and test

In [None]:
# Load the cleaned posts. The cells of this notebook that carry execution
# output read this same file through '../data/', so use that location here
# too; otherwise a fresh Restart & Run All stops at this cell.
df = pd.read_csv('../data/cleaned_extrovert.csv', engine='pyarrow')

## EDA on the data

Class distribution -> the data is imbalanced.
We can clearly see that more posts are classified as introvert than as extrovert.
-> TODO: do some reading on SMOTE

In [None]:
# Number of posts per class, drawn as one bar per label. x and y are passed
# explicitly so the figure does not depend on how a particular seaborn version
# interprets a Series given as the first positional argument.
label_counts = df['label'].value_counts()
ax = sns.barplot(x=label_counts.index, y=label_counts.values)
ax.set(title='Posts per class', xlabel='label', ylabel='number of posts');

Check the number of characters in each user's posts -> counting words won't work, because the data we have is split into 1500 space-separated entries.
We don't see a big difference between the two:
introverted: 5575 chars
extroverted: 5693 chars

In [None]:
# Character length of every post; entries are stringified before measuring.
df['char_count'] = df['post'].map(lambda post: len(str(post)))

# Mean post length per class, printed for label 1 first and label 0 second.
for label_value in (1, 0):
    in_class = df['label'] == label_value
    print(df.loc[in_class, 'char_count'].mean())

## Vectorization

In [None]:
# Hold out 20% of the posts for testing. The seed is pinned to 5 so the split
# is reproducible and keeps matching the token arrays saved as .npy files.
x_train, x_test, y_train, y_test = train_test_split(
    df['post'],
    df['label'],
    test_size=0.2,
    shuffle=True,
    random_state=5,
)

Word2Vec

In [None]:
def tokenize_pbar(data):
    """Tokenize every entry of ``data`` with NLTK while showing a progress bar.

    Parameters
    ----------
    data : iterable of str
        Texts to tokenize, e.g. a pandas Series of posts.

    Returns
    -------
    list
        One ``nltk.word_tokenize`` result per input entry, in input order.
    """
    tokenized_pbar = tqdm_notebook(data, desc="Tokenizing sentences", unit="sentence")
    try:
        return [nltk.word_tokenize(sentence) for sentence in tokenized_pbar]
    finally:
        # Close the bar even when tokenization raises, so a failed run does
        # not leave a dangling progress widget in the notebook.
        tokenized_pbar.close()

The cell below takes approximately 7-8 minutes to execute. Set `run = False` to load the previously saved token arrays instead -> this is the reason we're using random state = 5 in the train/test split.

In [None]:
# Set run=True to re-tokenize and overwrite the saved arrays; leave it False
# to load them. The saved arrays are only valid for the random_state=5 split.
run = False

# Same '../data' root that the executed cells of this notebook read from.
tokens_train_path = '../data/arrays/tokens_train.npy'
tokens_test_path = '../data/arrays/tokens_test.npy'

if run:
    # NumPy had trouble converting the token lists on its own, so the object
    # dtype is forced.
    x_train_tok_array = np.asarray(tokenize_pbar(x_train), dtype='object')
    x_test_tok_array = np.asarray(tokenize_pbar(x_test), dtype='object')

    np.save(tokens_train_path, x_train_tok_array)
    np.save(tokens_test_path, x_test_tok_array)

    # Carry on with the arrays so both branches leave the same type behind.
    x_train_tok, x_test_tok = x_train_tok_array, x_test_tok_array
else:
    # NOTE(security): allow_pickle=True unpickles the file contents, which can
    # run arbitrary code -- only load .npy files written by this notebook.
    x_train_tok = np.load(tokens_train_path, allow_pickle=True)
    x_test_tok = np.load(tokens_test_path, allow_pickle=True)

TF-IDF

In [None]:
# Learn the TF-IDF vocabulary and weights from the training posts only, then
# encode the test posts with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
x_train_vectors_tfidf = tfidf_vectorizer.fit_transform(raw_documents=x_train)
x_test_vectors_tfidf = tfidf_vectorizer.transform(raw_documents=x_test)

Word2Vec model

In [None]:
class MeanEmbeddingVectorizer:
    """Embed a tokenized document as the mean of its tokens' word vectors.

    Parameters
    ----------
    word2vec : mapping
        Lookup from token to embedding vector. The embedding width is read
        from the first vector; all vectors are assumed to share that length
        -- TODO confirm against the caller.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Width of one embedding; 0 when the lookup is empty instead of
        # failing with StopIteration.
        first_vector = next(iter(word2vec.values()), None)
        self.dim = 0 if first_vector is None else len(first_vector)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``y`` is accepted but ignored."""
        return self

    def transform(self, X):
        """Return one averaged embedding per document in ``X``.

        Parameters
        ----------
        X : iterable of iterables of tokens
            Tokenized documents.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(n_documents, dim)``. A document with no token
            present in the lookup is mapped to the zero vector.
        """
        transform_pbar = tqdm_notebook(X, desc="Transforming into vectors", unit="token")
        zero_vector = np.zeros(self.dim)
        rows = []
        for words in transform_pbar:
            known = [self.word2vec[w] for w in words if w in self.word2vec]
            rows.append(np.mean(known, axis=0) if known else zero_vector)
        if not rows:
            # Keep the result two-dimensional even when there is nothing to embed.
            return np.empty((0, self.dim))
        return np.array(rows)


In [None]:
# Re-use the token lists already prepared for the train/test split instead of
# spending another 7-8 minutes tokenizing every post again. Each list is put
# back on its original row through the index kept by the split.
df['tokens'] = pd.concat([
    pd.Series(list(x_train_tok), index=x_train.index),
    pd.Series(list(x_test_tok), index=x_test.index),
])
df.head()

Running the code below takes a long time.
TODO: model.save()

In [None]:
# Train the embeddings on the training tokens only, in line with the TF-IDF
# vectorizer, which is fitted on x_train alone; training on every post would
# let test-set text shape the features the classifier is later scored on.
# min_count=1 keeps every training token in the vocabulary.
model = Word2Vec(list(x_train_tok), min_count=1)

# Plain token -> vector lookup, the form MeanEmbeddingVectorizer expects.
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))

In [None]:
# Collapse each tokenized post into a single averaged word vector.
model_w = MeanEmbeddingVectorizer(word2vec=w2v)
x_train_vectors_w2v, x_test_vectors_w2v = (
    model_w.transform(tokens) for tokens in (x_train_tok, x_test_tok)
)

# We're ready to do some ML :)

### Logistic Regression -- tf*idf

In [None]:
# L2-penalised logistic regression (liblinear, C=10) on the TF-IDF features.
lr_tfidf = LogisticRegression(penalty='l2', C=10, solver='liblinear')
lr_tfidf.fit(X=x_train_vectors_tfidf, y=y_train)

In [None]:
# Hard labels for the report, plus the column-1 probabilities for the ROC curve.
y_predict = lr_tfidf.predict(X=x_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X=x_test_vectors_tfidf)[:, 1]

In [None]:
# Per-class precision, recall and F1 on the held-out posts.
print(classification_report(y_true=y_test, y_pred=y_predict))

In [None]:
# Confusion matrix of true labels against predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# ROC curve from the predicted probabilities, then the area underneath it.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_prob)
roc_auc = auc(x=fpr, y=tpr)
print('AUC:', roc_auc)

### Logistic Regression -- Word2Vec

In [None]:
# Same logistic-regression settings as for TF-IDF, now on the averaged word vectors.
lr_w2v = LogisticRegression(penalty='l2', C=10, solver='liblinear')
lr_w2v.fit(X=x_train_vectors_w2v, y=y_train)

In [None]:
# Hard labels for the report, plus the column-1 probabilities for the ROC curve.
y_predict = lr_w2v.predict(X=x_test_vectors_w2v)
y_prob = lr_w2v.predict_proba(X=x_test_vectors_w2v)[:, 1]

In [None]:
# Classification report first, then the confusion matrix, for the Word2Vec model.
print(classification_report(y_true=y_test, y_pred=y_predict))
print('Confusion Matrix: \n', confusion_matrix(y_true=y_test, y_pred=y_predict))

In [None]:
# ROC curve from the predicted probabilities, then the area underneath it.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_prob)
roc_auc = auc(x=fpr, y=tpr)
print('AUC:', roc_auc)

# Findings
1. Since we have a very imbalanced dataset, we're obtaining high AUC scores
    i. we can use the other metrics to explore the true power of the models 
2. We can introduce a method called Synthetic Minority Over-sampling Technique, or SMOTE for short
3. Use cross-validation !!


In [None]:
# Switch to the extrovert/introvert dataset; note that `df` is rebound here.
# The path uses the same '../data' root as the executed cells of this
# notebook, so the cell works from the notebook's own folder.
df = pd.read_csv('../data/extrovert_introvert.csv', engine='pyarrow')
df.head()

In [None]:
# 80/20 split with seed 5, using the 'extrovert' column as the target.
X_train, X_test, y_train, y_test = train_test_split(
    df['post'],
    df['extrovert'],
    test_size=0.2,
    random_state=5,
)

In [None]:
def _tfidf_pipeline(classifier):
    """Return a Pipeline that TF-IDF-vectorizes raw text and feeds ``classifier``."""
    return Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', classifier),
    ])


# Pipeline is already imported in the notebook's first cell, so it is not
# imported a second time here. The three pipelines differ only in the final
# estimator.
pipelineMNB = _tfidf_pipeline(MultinomialNB())
pipelineCNB = _tfidf_pipeline(ComplementNB())
pipelineSVC = _tfidf_pipeline(LinearSVC())

In [None]:
# Fit Multinomial NB on the raw training posts, then label the held-out posts.
pipelineMNB.fit(X=X_train, y=y_train)
pred_MNB = pipelineMNB.predict(X=X_test)

In [None]:
# Per-class metrics for the Multinomial NB pipeline.
print(classification_report(y_true=y_test, y_pred=pred_MNB))

Complement Naive Bayes apparently works best with imbalanced data.

In [None]:
# Fit Complement NB on the raw training posts, then label the held-out posts.
pipelineCNB.fit(X=X_train, y=y_train)
pred_CNB = pipelineCNB.predict(X=X_test)

In [None]:
# Per-class metrics for the Complement NB pipeline.
print(classification_report(y_true=y_test, y_pred=pred_CNB))

# Some oversampling techniques

### Synthetic Minority Oversampling Technique (SMOTE)

In [None]:
!pip install imbalanced-learn

In [3]:
# imbalanced-learn's Pipeline is imported under an alias, so the name
# `Pipeline` stays bound to the scikit-learn class.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as PipelineSmote

# Reload the cleaned dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../data/cleaned_extrovert.csv', engine='pyarrow')
df.head(n=5)

Unnamed: 0,author_id,post,label
0,t2_2hrxxs28,"question, doctor, how'd get painkillers? otc p...",0
1,t2_2hrxxs28,butt covid + cycle. i'm sure what's going i've...,0
2,t2_2hrxxs28,different doctors. situation sucks relate peop...,0
3,t2_4pxpgwz,thought pebbleyeet guy autistic guy wants “fix...,0
4,t2_4pxpgwz,…i always end voting wrong even crewmate. hour...,0


In [4]:
# 80/20 split of the cleaned posts with seed 5, target column 'label'.
X_train, X_test, y_train, y_test = train_test_split(
    df['post'],
    df['label'],
    test_size=0.2,
    random_state=5,
)


In [None]:
# TF-IDF features -> SMOTE oversampling (seed 5) -> Complement NB.
pipeline_smote = PipelineSmote(steps=[
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE(random_state=5)),
    ('classifier', ComplementNB()),
])

pipeline_smote.fit(X=X_train, y=y_train)


In [None]:
# Label the held-out posts with the SMOTE pipeline and report per-class metrics.
y_pred = pipeline_smote.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))


Testing the pipelines laid out in process.py

In [6]:
# Project helpers: build_pipeline is called below with one of the model_keys
# entries; the bare expression displays the available keys.
from methods.process import build_pipeline, model_keys
model_keys

['naive-bayes', 'svm', 'logistic', 'random-forest']

In [10]:
%%time
# Build the 'naive-bayes' pipeline from methods.process and time one fit on
# the training split.
pipeline = build_pipeline('naive-bayes')
pipeline.fit(X_train, y_train)

CPU times: total: 3min 59s
Wall time: 4min 52s


In [None]:
%%time
findings = dict()
for model in model_keys:
    pipeline = build_pipeline(model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:,1]
    fpr, tpr, _thresholds = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    print(model)
    print('AUC:', roc_auc)
    findings[model] = {'fpr': fpr, 'tpr': tpr, 'thresholds': _thresholds}