In [13]:
%matplotlib inline

!pip install geonamescache
!pip install nltk
!pip install wordcloud

# General libraries
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import *
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# SK-learn libraries for feature extraction from text
from sklearn.feature_extraction.text import *

# NLP processors
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize, FreqDist
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from numpy import mean
from numpy import std

# WordCloud
from wordcloud import WordCloud



[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1076)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1076)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1076)>


ModuleNotFoundError: No module named 'wordcloud'

In [2]:
#read in data
# df = pd.read_csv(r'C:\Users\lwu31\OneDrive - JNJ\Documents\train.csv')
df = pd.read_csv('data/nlp-getting-started/train.csv')
# sample the data, acts as shuffling the data on row

#50/50 split between train and dev
# allocate more for traiing if we do it this way, i'll run some
# analysis to see if my cluster bootstrap can imrpove the models we run.
numtest = int(len(df)/5)
df_train = df[numtest:].reset_index(drop=True)
df_test = df[:numtest].reset_index(drop=True)

train_data, train_label = df_train.text, df_train.target
test_data, test_label = df_test.text, df_test.target

#split into disaster and non disaster data
df_neg = df_train.loc[df_train.target == 0]
df_pos = df_train.loc[df_train.target == 1]

#split into disaster and nondisaster tweets only
neg_text = df_neg.text
pos_text = df_pos.text

print("Some data metrics\n")
print("Shape of train data:", df_train.shape)
print("\nMissing data in each column:\n" + str(df.isnull().sum()))
print("\nNumber of disaster tweets:\n"+ str(train_label.value_counts()))

Some data metrics

Shape of train data: (6091, 5)

Missing data in each column:
id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

Number of disaster tweets:
0    3396
1    2695
Name: target, dtype: int64


In [3]:
# Preprocess the data prior to running the model using this function
# TODO: Here I've commented out the punctuation regex section
# Feel free to uncomment that part BUT I think we should leave it out
# altogether. See below for usage.

def preprocess(text, method=None, tokenizer=sent_tokenize, rm_stop=False): 
    """Returns a text processed string.

    Arguments:
    text      -- String, func is designed for loops
    
    method    -- ('s','l') Specify from s - stemmatize, l - lemmatize.
                 None will mean you do not want to remove suffix.
                 
    tokenizer -- Any tokenizer function, from word to sentence to tweet.
                 Tokenizer must not be an object.method unless you
                 specifiy it to be like TweetTokenizer.tokenize.
                 Sentence tokenizer is initialized here.
                 
    rm_stop   -- Bool. Remove stop words or not.
    """

    #remove line breaks
    text = re.sub(r"\n","",text)
    #remove trailing spaces
    text = re.sub(r'[ \t]+$','', text)
    #convert to lowercase 
    text = text.lower()
    #remove digits and currencies 
    text = re.sub(r"\d+","",text) 
    text = re.sub(r'[\$\d+\d+\$]', "", text)
    #remove dates 
    text = re.sub(r'\d+[\.\/-]\d+[\.\/-]\d+', '', text)
    text = re.sub(r'\d+[\.\/-]\d+[\.\/-]\d+', '', text)
    text = re.sub(r'\d+[\.\/-]\d+[\.\/-]\d+', '', text)
    #remove non-ascii
    text = re.sub(r'[^\x00-\x7f]',r' ',text) 
    # Replacing all links with standard link
    #text = re.sub(r'https?:\/\/.*[\r\n]*', '', text) 
    #text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)
    text = re.sub(link_regex, "http://t.co", text)
    # separate out mention symbol from text so that models can learn from number of mentions
    p = re.compile(mention_regex)
    text = p.sub(r'@ \1',text)

    # separate out hashtag symbol from hashtag so that models can learn from number of hashtags
    q = re.compile(hashtag_regex)
    text = q.sub(r'# \1',text)
    
    # remove retweet indicator text as it's rarely used
    text = re.sub(retweet_indicator, "", text)
    
    #remove punctuation
    # Leave it? or talking point?!
    #text = re.sub(r'[^\w\s]','',text)
    
    if rm_stop:
        filtered_tokens = [word for word in tokenizer(text) 
                           if not word in set(stopwords.words('english'))]
        text = " ".join(filtered_tokens)
        
    if method == 'l':
        lemmer = WordNetLemmatizer()
        lemm_tokens = [lemmer.lemmatize(word) 
                       for word in tokenizer(text)]
        return " ".join(lemm_tokens)
    
    elif method == 's':
        porter = PorterStemmer()
        stem_tokens = [porter.stem(word) 
                       for word in tokenizer(text)]
        return " ".join(stem_tokens)
    return text

# GMM

In [27]:
train_data = df
train_labels = train_data.target
cv = KFold(n_splits=10, random_state=1, shuffle=True)

In [53]:

def run_model_on_preprocessed_text(preprocessed_text):
    tfidf = TfidfVectorizer()
    transformed_data = tfidf.fit_transform(preprocessed_text)
    model = MultinomialNB(alpha=0.9) # Best alpha from project 3
    accuracy_scores = cross_val_score(model, transformed_data, train_labels, scoring='accuracy', cv=cv)
    f1_scores = cross_val_score(model, transformed_data, train_labels, scoring='f1', cv=cv)
    print('Accuracy: %.3f (%.3f)' % (mean(accuracy_scores), std(accuracy_scores)))
    print('F1 Scores: %.3f (%.3f)' % (mean(f1_scores), std(f1_scores)))

def preprocess_text():
    return [preprocess(i) for i in train_data.text]
    
def preprocess_text_with_additional_cleaning(tokenizer):
    return [preprocess(i,method='s',tokenizer=tokenizer,rm_stop=True) for i in train_data.text]


In [44]:

def run_model_on_transformed_data(model, transformed_data, params):
    print("Fitting GridSearch to optimize accuracy (this takes a while)...")
    accuracy_gridsearch_model = fit_gridsearch_model(model, transformed_data, params, 'accuracy')
    print("Fitting GridSearch to optimize f1 score (this takes a while)...")
    f1_gridsearch_model = fit_gridsearch_model(model, transformed_data, params, 'f1')

    print('Best params for accuracy: ', accuracy_gridsearch_model.best_params_)
    print('Best mean score for accuracy: {:.2%}'.format(accuracy_gridsearch_model.best_score_))

    print('Best params for f1: ', f1_gridsearch_model.best_params_)
    print('Best mean score for f1: {:.2%}'.format(f1_gridsearch_model.best_score_))
    
def fit_gridsearch_model(model, data, params, scoring):
    gridsearch_model = GridSearchCV(model, param_grid=params, cv=cv, scoring=scoring, n_jobs=-1, verbose=3)
    return gridsearch_model.fit(data, train_labels)

def transform_data_with_count_vectorizer(preprocessed_text):
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(preprocessed_text)
       
def transform_data_with_tfidf(preprocessed_text):
    tfidf = TfidfVectorizer()
    return tfidf.fit_transform(preprocessed_text)

In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GaussianMixture


preprocessed_text = preprocess_text()
preprocessed_text_with_additional_cleaning = preprocess_text_with_additional_cleaning(word_tokenize)

tfidf_transformed = transform_data_with_tfidf(preprocessed_text)
count_vectorizer_transformed = transform_data_with_count_vectorizer(preprocessed_text)

tfidf_transformed_cleaned = transform_data_with_tfidf(preprocessed_text_with_additional_cleaning)
count_vectorizer_transformed_cleaned = transform_data_with_count_vectorizer(preprocessed_text_with_additional_cleaning)

def svd_for_gmm(vectorized_text):
    svd = TruncatedSVD()
    return svd.fit_transform(vectorized_text)

def run_model_on_transformed_data_gmm(model, transformed_data, params):
    print("Fitting GridSearch to optimize accuracy (this takes a while)...")
    accuracy_gridsearch_model = fit_gridsearch_model(model, transformed_data, params, 'accuracy',average='weighted')
    print("Fitting GridSearch to optimize f1 score (this takes a while)...")
    f1_gridsearch_model = fit_gridsearch_model(model, transformed_data, params, 'f1', average='weighted')

    print('Best params for accuracy: ', accuracy_gridsearch_model.best_params_)
    print('Best mean score for accuracy: {:.2%}'.format(accuracy_gridsearch_model.best_score_))

    print('Best params for f1: ', f1_gridsearch_model.best_params_)
    print('Best mean score for f1: {:.2%}'.format(f1_gridsearch_model.best_score_))
gmm    = GaussianMixture(random_state=12345)
params = {
          'covariance_type' : ['spherical', 'diag', 'tied', 'full'],
          'n_components'    : [1, 2, 4, 6,10],
         }

run_model_on_transformed_data(gmm, svd_for_gmm(count_vectorizer_transformed), params)
run_model_on_transformed_data(gmm, svd_for_gmm(tfidf_transformed), params)
run_model_on_transformed_data(gmm, svd_for_gmm(tfidf_transformed_cleaned), params)
run_model_on_transformed_data(gmm, svd_for_gmm(count_vectorizer_transformed_cleaned), params)

Fitting GridSearch to optimize accuracy (this takes a while)...
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Fitting GridSearch to optimize f1 score (this takes a while)...
Fitting 10 folds for each of 20 candidates, totalling 200 fits


KeyboardInterrupt: 

In [55]:
count_vectorizer_transformed.

<7613x16378 sparse matrix of type '<class 'numpy.int64'>'
	with 104109 stored elements in Compressed Sparse Row format>

In [7]:

# TESTING no text stripping
np.random.seed(0)
df_ = df.sample(frac=1).reset_index()



#Split training data into disaster vs non-disaster
tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(train_data)
test_ = tfidf.transform(test_data)
svd = TruncatedSVD()
t_data = svd.fit_transform(t_data)
test_ = svd.transform(test_)

t_data_d = t_data[train_labels==1]
t_data_nd = t_data[train_labels==0]


# Create two gmm models, one for each class
# implement GridSearchCV
gmm_d = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_d)



gmm_nd = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_nd)


# Compute log probabilities and run np.exp on them 
# to get the probabilities.
p_disaster = np.exp(gmm_d.score_samples(test_))
np_disaster = np.exp(gmm_nd.score_samples(test_))

# Use boolean np.where checker to get predicted labels given by gmm
# Display accuracy score after.
preds = np.where(p_disaster > np_disaster, 1, 0)
print('Accuracy {}: {:.2f}%'.format('no text stripping w/ sw', metrics.accuracy_score(test_labels, preds)*100))


# no text stripping w/o SW

np.random.seed(0)
df_ = df.sample(frac=1).reset_index()

processed_full = []
for i in df_.text:
    processed_full.append(preprocess(i,rm_stop=True))
df_.text = processed_full
numtest = int(len(df_)/5)
df_train = df_[numtest:].reset_index(drop=True)
df_test = df_[:numtest].reset_index(drop=True)

train_data, train_labels = df_train.text, df_train.target
test_data, test_labels = df_test.text, df_test.target

#Split training data into disaster vs non-disaster
tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(train_data)
test_ = tfidf.transform(test_data)
svd = TruncatedSVD()
t_data = svd.fit_transform(t_data)
test_ = svd.transform(test_)

t_data_d = t_data[train_labels==1]
t_data_nd = t_data[train_labels==0]

# Create two gmm models, one for each class
# implement GridSearchCV
gmm_d = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_d)

gmm_nd = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_nd)


# Compute log probabilities and run np.exp on them 
# to get the probabilities.
p_disaster = np.exp(gmm_d.score_samples(test_))
np_disaster = np.exp(gmm_nd.score_samples(test_))

# Use boolean np.where checker to get predicted labels given by gmm
# Display accuracy score after.
preds = np.where(p_disaster > np_disaster, 1, 0)
print('Accuracy {}: {:.2f}%'.format('no text stripping w/o sw', metrics.accuracy_score(test_labels, preds)*100))



# Lemmatize no stop word removal
np.random.seed(0)
df_ = df.sample(frac=1).reset_index()

processed_full = []
for i in df_.text:
    processed_full.append(preprocess(i,method='l'))
df_.text = processed_full
numtest = int(len(df_)/5)
df_train = df_[numtest:].reset_index(drop=True)
df_test = df_[:numtest].reset_index(drop=True)

train_data, train_labels = df_train.text, df_train.target
test_data, test_labels = df_test.text, df_test.target

#Split training data into disaster vs non-disaster
tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(train_data)
test_ = tfidf.transform(test_data)
svd = TruncatedSVD()
t_data = svd.fit_transform(t_data)
test_ = svd.transform(test_)

t_data_d = t_data[train_labels==1]
t_data_nd = t_data[train_labels==0]


# Create two gmm models, one for each class
# implement GridSearchCV
gmm_d = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_d)

gmm_nd = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_nd)


# Compute log probabilities and run np.exp on them 
# to get the probabilities.
p_disaster = np.exp(gmm_d.score_samples(test_))
np_disaster = np.exp(gmm_nd.score_samples(test_))

# Use boolean np.where checker to get predicted labels given by gmm
# Display accuracy score after.
preds = np.where(p_disaster > np_disaster, 1, 0)
print('Accuracy {}: {:.2f}%'.format('lemmatize w/SW', metrics.accuracy_score(test_labels, preds)*100))

# Lemmatize stop word removal
np.random.seed(0)
df_ = df.sample(frac=1).reset_index()

processed_full = []
for i in df_.text:
    processed_full.append(preprocess(i,method='l', rm_stop=True))
df_.text = processed_full
numtest = int(len(df_)/5)
df_train = df_[numtest:].reset_index(drop=True)
df_test = df_[:numtest].reset_index(drop=True)

train_data, train_labels = df_train.text, df_train.target
test_data, test_labels = df_test.text, df_test.target

#Split training data into disaster vs non-disaster
tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(train_data)
test_ = tfidf.transform(test_data)
svd = TruncatedSVD()
t_data = svd.fit_transform(t_data)
test_ = svd.transform(test_)

t_data_d = t_data[train_labels==1]
t_data_nd = t_data[train_labels==0]


# Create two gmm models, one for each class
# implement GridSearchCV
gmm_d = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_d)

gmm_nd = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_nd)


# Compute log probabilities and run np.exp on them 
# to get the probabilities.
p_disaster = np.exp(gmm_d.score_samples(test_))
np_disaster = np.exp(gmm_nd.score_samples(test_))

# Use boolean np.where checker to get predicted labels given by gmm
# Display accuracy score after.
preds = np.where(p_disaster > np_disaster, 1, 0)
print('Accuracy {}: {:.2f}%'.format('lemmatize w/o SW', metrics.accuracy_score(test_labels, preds)*100))

# stemmatize w/o stop word removal
np.random.seed(0)
df_ = df.sample(frac=1).reset_index()

processed_full = []
for i in df_.text:
    processed_full.append(preprocess(i,method='s', rm_stop=False))
df_.text = processed_full
numtest = int(len(df_)/5)
df_train = df_[numtest:].reset_index(drop=True)
df_test = df_[:numtest].reset_index(drop=True)

train_data, train_labels = df_train.text, df_train.target
test_data, test_labels = df_test.text, df_test.target

#Split training data into disaster vs non-disaster
tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(train_data)
test_ = tfidf.transform(test_data)
svd = TruncatedSVD()
t_data = svd.fit_transform(t_data)
test_ = svd.transform(test_)

t_data_d = t_data[train_labels==1]
t_data_nd = t_data[train_labels==0]


# Create two gmm models, one for each class
# implement GridSearchCV
gmm_d = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_d)

gmm_nd = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_nd)


# Compute log probabilities and run np.exp on them 
# to get the probabilities.
p_disaster = np.exp(gmm_d.score_samples(test_))
np_disaster = np.exp(gmm_nd.score_samples(test_))

# Use boolean np.where checker to get predicted labels given by gmm
# Display accuracy score after.
preds = np.where(p_disaster > np_disaster, 1, 0)
print('Accuracy {}: {:.2f}%'.format('stem w/ SW', metrics.accuracy_score(test_labels, preds)*100))


# stemmatize w stop word removal
np.random.seed(0)
df_ = df.sample(frac=1).reset_index()

processed_full = []
for i in df_.text:
    processed_full.append(preprocess(i,method='s', rm_stop=True))
df_.text = processed_full
numtest = int(len(df_)/5)
df_train = df_[numtest:].reset_index(drop=True)
df_test = df_[:numtest].reset_index(drop=True)

train_data, train_labels = df_train.text, df_train.target
test_data, test_labels = df_test.text, df_test.target

#Split training data into disaster vs non-disaster
tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(train_data)
test_ = tfidf.transform(test_data)
svd = TruncatedSVD()
t_data = svd.fit_transform(t_data)
test_ = svd.transform(test_)

t_data_d = t_data[train_labels==1]
t_data_nd = t_data[train_labels==0]


# Create two gmm models, one for each class
# implement GridSearchCV
gmm_d = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_d)

gmm_nd = GaussianMixture(n_components=10,
                            covariance_type='full', 
                            random_state=12345).fit(t_data_nd)


# Compute log probabilities and run np.exp on them 
# to get the probabilities.
p_disaster = np.exp(gmm_d.score_samples(test_))
np_disaster = np.exp(gmm_nd.score_samples(test_))

# Use boolean np.where checker to get predicted labels given by gmm
# Display accuracy score after.
preds = np.where(p_disaster > np_disaster, 1, 0)
print('Accuracy {}: {:.2f}%'.format('stem w/o SW', metrics.accuracy_score(test_labels, preds)*100))



Accuracy no text stripping w/ sw: 65.31%
Accuracy no text stripping w/o sw: 65.31%
Accuracy lemmatize w/SW: 65.31%
Accuracy lemmatize w/o SW: 65.31%
Accuracy stem w/ SW: 64.65%
Accuracy stem w/o SW: 64.65%


In [6]:
# preprocess data-> split normally
# set random seed
# preprocess data-> split normally
# set random seed

np.random.seed(0)
df_ = df.sample(frac=1).reset_index()

processed_full = []
for i in df_.text:
    processed_full.append(preprocess(i))
df_.text = processed_full

# numtest = int(len(df_)/3.5)
# df_test = df_[:int(numtest/2)].reset_index(drop=True)
# df_dev = df_[int(numtest/2):numtest].reset_index(drop=True)
numtest = int(len(df_)/5)
df_train = df_[numtest:].reset_index(drop=True)
df_test = df_[:numtest].reset_index(drop=True)

train_data, train_label = np.array(df_train.text), np.array(df_train.target)
dev_data, dev_label = np.array(df_dev.text), np.array(df_dev.target)
test_data, test_label = np.array(df_test.text), np.array(df_test.target)
# Naive Bayes example run, using non clustered data first.
# I'll use TF-IDF in this to vectorize data.

tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(train_data)
dt_data = tfidf.transform(dev_data)
tt_data = tfidf.transform(test_data)
m_nb = MultinomialNB(alpha=0.9).fit(t_data, train_label) # best alpha from project 3
pred = m_nb.predict(dt_data)
pred_test = m_nb.predict(tt_data)
print('Test on no word root strip:')
print('F1 Score: {:.4f}'.format(metrics.f1_score(test_label, pred_test, average='weighted')))
print('Accuracy: {:.4f}'.format(metrics.accuracy_score(test_label, pred_test)))
print()

tokenizer = word_tokenize
np.random.seed(0)
df_ = df.sample(frac=1).reset_index()

stop_words = set(stopwords.words('english'))
processed_full = []
for i in df_.text:
    processed_full.append(preprocess(i,method='s',tokenizer=tokenizer))
df_.text = processed_full

# numtest = int(len(df_)/3.5)
# df_test = df_[:int(numtest/2)].reset_index(drop=True)
# df_dev = df_[int(numtest/2):numtest].reset_index(drop=True)
numtest = int(len(df_)/5)
df_train = df_[numtest:].reset_index(drop=True)
df_test = df_[:numtest].reset_index(drop=True)

train_data, train_label = np.array(df_train.text), np.array(df_train.target)
dev_data, dev_label = np.array(df_dev.text), np.array(df_dev.target)
test_data, test_label = np.array(df_test.text), np.array(df_test.target)
# Naive Bayes example run, using non clustered data first.
# I'll use TF-IDF in this to vectorize data.

tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(train_data)
dt_data = tfidf.transform(dev_data)
tt_data = tfidf.transform(test_data)
m_nb = MultinomialNB(alpha=0.9).fit(t_data, train_label) # best alpha from project 3
pred = m_nb.predict(dt_data)
pred_test = m_nb.predict(tt_data)
print('Test on Stemmatize:')
print('F1 Score: {:.4f}'.format(metrics.f1_score(test_label, pred_test, average='weighted')))
print('Accuracy: {:.4f}'.format(metrics.accuracy_score(test_label, pred_test)))

# preprocess data-> split normally
# set random seed

# # stem
# studies -> studi
# study -> stud
# # lemm
# studies -> study
# study -> study

tokenizer = word_tokenize

np.random.seed(0)
df_ = df.sample(frac=1).reset_index()

stop_words = set(stopwords.words('english'))
processed_full = []
for i in df_.text:
    processed_full.append(preprocess(i,method='l',tokenizer=tokenizer))
df_.text = processed_full

# numtest = int(len(df_)/3.5)
# df_test = df_[:int(numtest/2)].reset_index(drop=True)
# df_dev = df_[int(numtest/2):numtest].reset_index(drop=True)
numtest = int(len(df_)/5)
df_train = df_[numtest:].reset_index(drop=True)
df_test = df_[:numtest].reset_index(drop=True)

train_data, train_label = np.array(df_train.text), np.array(df_train.target)
dev_data, dev_label = np.array(df_dev.text), np.array(df_dev.target)
test_data, test_label = np.array(df_test.text), np.array(df_test.target)
# Naive Bayes example run, using non clustered data first.
# I'll use TF-IDF in this to vectorize data.

tfidf = CountVectorizer(ngram_range=(1,1))
t_data = tfidf.fit_transform(train_data)
dt_data = tfidf.transform(dev_data)
tt_data = tfidf.transform(test_data)
m_nb = MultinomialNB(alpha=0.9).fit(t_data, train_label) # best alpha from project 3
pred = m_nb.predict(dt_data)
pred_test = m_nb.predict(tt_data)
print()
print('Test on Lemmatize:')
print('F1 Score: {:.4f}'.format(metrics.f1_score(test_label, pred_test, average='weighted')))
print('Accuracy: {:.4f}'.format(metrics.accuracy_score(test_label, pred_test)))

NameError: name 'df_dev' is not defined

In [None]:
def cluster_boot(df, n_clusters=2):
    # Must accept pre-processed data as DF
    
    """ Pre-Cluster data before splitting to enhance generalization"""
    dat = df.text
    tfidf = TfidfVectorizer()
    t_data = tfidf.fit_transform(dat)
    # pca = 
    cluster = KMeans(n_clusters=n_clusters).fit(t_data) # really bad clustering for loo
    df['assign'] = cluster.labels_
    
    if n_clusters == 2:
        if len(df[df['assign']==1]) > len(df[df['assign']==0]):
            s, l = 0, 1
        else:
            s, l = 1, 0
        len_valid = int(len(df[df['assign']==s])/2)
        df_test = df[df['assign']==s][:len_valid].reset_index(drop=True)
        df_dev = df[df['assign']==s][len_valid:].reset_index(drop=True)
        df_train = df[df['assign']==l].reset_index(drop=True)
    else:
        groups = [] # list of two tuples of clusters
        centroids = cluster.cluster_centers_
    
    # Simple prelim:: Sparse matrix for Spectral Clustering
    # if cluster is more than 2 then use the majority of clusters
    # closest to each other as training set and the rest as dev/test
    # notes: this method did not work, the spectral takes too long for
    # this size of a sparse matrix.
    #     elif typ == 'cv':
    #         cv = CountVectorizer()
    #         t_data = cv.fit_transform(dat)
    #         cluster = SpectralClustering(n_clusters=2).fit(t_data)
    return (df_train, df_dev, df_test)

df_train,df_dev,df_test = cluster_boot(df_)
train_data, train_label = np.array(df_train.text), np.array(df_train.target)
dev_data, dev_label = np.array(df_dev.text), np.array(df_dev.target)
test_data, test_label = np.array(df_test.text), np.array(df_test.target)
tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(train_data)
dt_data = tfidf.transform(dev_data)
tt_data = tfidf.transform(test_data)
m_nb = MultinomialNB(alpha=0.9).fit(t_data, train_label) # best alpha from project 3
pred = m_nb.predict(dt_data)
pred_t = m_nb.predict(tt_data)
print('Metrics by cluster-splitting')
print('F1 Score: {:.4f}'.format(metrics.f1_score(dev_label, pred, average='weighted')))
print('Accuracy: {:.4f}'.format(metrics.accuracy_score(dev_label, pred)))


In [None]:
df_.text

In [None]:
# TESTING 
dat = df_.text
tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(dat)
# not doing LSA, performing SVD is only for sake of cluster.. 
# due to very sparse data it is difficult to cluster on KMeans, 
# so we SVD I.E sparse->sparse to reduce dimensions
svd = TruncatedSVD()
t_data = svd.fit_transform(t_data)
cluster = KMeans(n_clusters=2).fit(t_data) # really bad clustering for loo

df['assign'] = cluster.labels_
if len(df[df['assign']==1]) > len(df[df['assign']==0]):
    s, l = 0, 1
else:
    s, l = 1, 0
len_valid = int(len(df[df['assign']==s])/2)
df_test = df[df['assign']==s][:len_valid].reset_index(drop=True)
df_dev = df[df['assign']==s][len_valid:].reset_index(drop=True)
df_train = df[df['assign']==l].reset_index(drop=True)

train_data, train_label = np.array(df_train.text), np.array(df_train.target)
dev_data, dev_label = np.array(df_dev.text), np.array(df_dev.target)
test_data, test_label = np.array(df_test.text), np.array(df_test.target)
tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(train_data)
dt_data = tfidf.transform(dev_data)
tt_data = tfidf.transform(test_data)
m_nb = MultinomialNB(alpha=0.9).fit(t_data, train_label) # best alpha from project 3
pred = m_nb.predict(dt_data)
pred_t = m_nb.predict(tt_data)
print('Metrics by cluster-splitting')
print('F1 Score: {:.4f}'.format(metrics.f1_score(dev_label, pred, average='weighted')))
print('Accuracy: {:.4f}'.format(metrics.accuracy_score(dev_label, pred)))


In [None]:
dat = df.text
tfidf = TfidfVectorizer()
t_data = tfidf.fit_transform(dat)
# not doing LSA, performing SVD is only for sake of cluster.. 
# due to very sparse data it is difficult to cluster on KMeans, 
# so we SVD I.E sparse->sparse to reduce dimensions
svd = TruncatedSVD()
t_data = svd.fit_transform(t_data)
cluster = KMeans(n_clusters=6).fit(t_data) # really bad clustering for loo
cluster.cluster_centers_

In [None]:
plt.figure(figsize=(15, 15))
plt.spy(t_data,markersize=1)

# reference
https://necromuralist.github.io/Neurotic-Networking/posts/nlp/01-twitter-preprocessing-with-nltk/