In [1]:
import pandas as pd
import numpy as np
import re

import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet


from collections import Counter
from textblob import TextBlob

import pickle
import re

from scipy.sparse import vstack

## Tweets overview

In [2]:
def print_overview(path):
    df = pd.read_csv(path, header = None)
    df = df.transpose()
    for i in df.loc[:30, 0]:
        print(i)
    print('\n Describe:', df.describe())
    print('\n Len:', len(df.index))
    print('\n Count:', df.count()[0])
      
print('\t ***POSIRIVE*** \t')
print_overview('data/processedPositive.csv')
print('\n \t ***NEGATIVE*** \t') 
print_overview('data/processedNegative.csv')
print('\n \t ***NEUTRAL*** \t')
print_overview('data/processedNeutral.csv')

	 ***POSIRIVE*** 	
An inspiration in all aspects: Fashion
 fitness
 beauty and personality. :)KISSES TheFashionIcon
Apka Apna Awam Ka Channel Frankline Tv Aam Admi Production Please Visit Or Likes  Share :)Fb Page :...
Beautiful album from  the greatest unsung guitar genius of our time - and I've met the great backstage
Good luck to Rich riding for great project in this Sunday. Can you donate?
Omg he... kissed... him crying with joy
happy anniv ming and papi!!!!! love love happy
thanks happy
C'mon Tweeps
 Join  vote for the singer! Do spread the word. :D
Thanks for the great review! smile
Yay another art raffle! Everything you need to know is in the picture :D
Hello I hope you visit Luxor its amazing city in Egypt pleas check
We got a Vive tracker in the office and our intern
 went to work.Don't get too excited
 this isn't
Take a look at favourites.io You can do this and more happy
Go back to school for music! I think I will in time happy
Sixth spot not applicable Team! Higher pa! :)KI

In [3]:
'''
preprocessing
- remove nan
- concat rows to tweet
- remove duplication

Input: csv filename
Ountut: processing df

'''

def preprocessing(name_csv):
    df = pd.read_csv(name_csv, header = None)
    df.set_index(pd.Index(['tweet']), inplace = True)
    df = df.transpose()
    df.dropna()
    df['tweet'] = df['tweet'].apply(str)

    if len(df.index) < 1:
        return None
    return_df = pd.DataFrame()
    tweet = df.iloc[0, 0]
    for row in df.iloc[1:, 0]:
        if len(row) < 1:
            continue
        if row[0].isspace():
            tweet = (tweet.strip() + ' ' + row.strip()).strip()
        else:
            return_df = pd.concat([return_df, pd.DataFrame([tweet])], axis = 0)
            tweet = row
    return_df.columns = ['tweet']
    return_df.drop_duplicates(inplace=True, ignore_index=True)
    return(return_df)

In [4]:
def remove_nan_dupl(df, name):
    nan_value = float("NaN")
    df_tweet = df[name].replace("", nan_value)
    df_tweet.dropna(inplace = True)
    df_tweet.drop_duplicates(inplace=True)
    df = df.loc[df_tweet.index]
    df.index = list(range(len(df)))
    return df

def concatinate_all_tweets(pos, neg, neu):
    pos['sense'] = 1
    neg['sense'] = -1
    neu['sense'] = 0
    df = pd.concat([pos, neg, neu], axis = 0, ignore_index = True)
    return(df)

def to_lower(df):
    df['tweet'] = df['tweet'].str.lower()
    return(df)

def remove_punctuation(df):
    df['tweet'] = df['tweet'].str.replace('[^a-z_\s]','', regex=True)
    return(df)

def delete_stopwords(text):
    STOPWORDS = set(stopwords.words('english'))
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

In [5]:
with open('data/Emoticon_Dict.p', 'rb') as fp:
    Emoticon_Dict = pickle.load(fp)

def convert_emoticons_to_word(text):
    for k, v in Emoticon_Dict.items():
        emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in Emoticon_Dict) + u')')
        repl = ' ' + v.replace(' ', '_') + ' ';
        text = emoticon_pattern.sub(repl, text)
    return text

def check_most_frequent_words(df, count_words):
    cnt = Counter()
    for text in df["text_stop"].values:
        for word in text.split():
            cnt[word] += 1
    return(cnt.most_common(count_words))

def freqwords(text):
    cnt = check_most_frequent_words(df, 10)
    freq = set([w for (w, wc) in cnt])
    return " ".join([word for word in str(text).split() if word not in freq])

def tokenization(text):
    text = re.split('\W+', text)
    return text

def stem_words(text):
    stemmer = SnowballStemmer("english")
    word_tokens = tokenization(text)
    stem_list = list(map(stemmer.stem, word_tokens))
    return " ".join(stem_list)

def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV} 
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])


### ALL PREPROCCESSING

In [6]:
pos = preprocessing('data/processedPositive.csv')
neg = preprocessing('data/processedNegative.csv')
neu = preprocessing('data/processedNeutral.csv')


df = concatinate_all_tweets(pos, neg, neu)
df['tweet'] = df['tweet'].apply(convert_emoticons_to_word)
df = to_lower(df)
df = remove_punctuation(df)

df["stop"] = df["tweet"].apply(delete_stopwords)
df = remove_nan_dupl(df, "stop")
df["common"] = df["stop"]

'''
Processing required by subject:
'''

df["token"] = df['common'].apply(lambda x: tokenization(x.lower()))
df["token"] = df['token'].apply(lambda x: ' '.join(x))

df["stem"] = df["common"].apply(stem_words)

df["lemma"] = df["common"].apply(lemmatize_words)

df['misspellings'] = df['common'].apply(lambda x: str(TextBlob(x).correct()))

df['stem+misspellings'] = df['misspellings'].apply(stem_words)

df['lemmatize+misspellings'] = df['misspellings'].apply(lemmatize_words)

## Train-test split

In [7]:
from sklearn.model_selection import train_test_split

data = df.copy()

y = data['sense']

X = data[["token", "stem", "lemma", "misspellings", "stem+misspellings", "lemmatize+misspellings"]]
X_train_all, X_test_all, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

X_all = pd.concat([X_train_all, X_test_all], axis = 1)

print("Train data:", X_train_all.shape, y_train.shape)
print("Train data:", X_test_all.shape, y_test.shape)

Train data: (2118, 6) (2118,)
Train data: (530, 6) (530,)


## Transform to vectors

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def bow(X_train, X_test):
    vect = CountVectorizer(binary=True)
    X_train_bow = vect.fit_transform(X_train)
    X_test_bow = vect.transform(X_test)
    X_bow = vstack([X_train_bow, X_test_bow])
    return X_train_bow, X_test_bow, X_bow

def word_count(X_train, X_test):
    vect = CountVectorizer() 
    X_train_count = vect.fit_transform(X_train)
    X_test_count = vect.transform(X_test)
    X_count = vstack([X_train_count, X_test_count])
    return X_train_count, X_test_count, X_count

def tfidf(X_train, X_test):
    vect = TfidfVectorizer()
    X_train_tfidf = vect.fit_transform(X_train)
    X_test_tfidf = vect.transform(X_test)
    X_tfidf = vstack([X_train_tfidf, X_test_tfidf])
    return X_train_tfidf, X_test_tfidf, X_tfidf


In [9]:
vectorized_dict = {}

for process in X.columns:
    vectorized_key = "BOW_" + process
    vectorized_dict[vectorized_key] = list(bow(X_train_all[process], X_test_all[process]))

    vectorized_key = "WORDCOUNT_" + process
    vectorized_dict[vectorized_key] = list(word_count(X_train_all[process], X_test_all[process]))

    vectorized_key = "TFIDF_" + process
    vectorized_dict[vectorized_key] = list(tfidf(X_train_all[process], X_test_all[process]))

## Similarity

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from scipy.sparse import coo_matrix
from scipy.sparse import tril

for k, v in vectorized_dict.items():
    print('\t\t', k.upper(), '\t\t')
    dataset = v[0]
    similarities_sparse = cosine_similarity(dataset, dense_output=False)
    processing_name = k.split('_')[1]
    coo = coo_matrix(similarities_sparse)
    coo.setdiag(0)
    coo = tril(coo, k = 1)
    coo.eliminate_zeros()
    tuples = list(zip(coo.row, coo.col, coo.data))
    top_10_numb = sorted(tuples, key = lambda x: x[2], reverse=True)[:10]
    for tweets_pair, numb in zip(top_10_numb, range(len(top_10_numb))):
        print("number = ", numb + 1)
        print(X_train_all[processing_name].iloc[tweets_pair[0]])
        print(X_train_all[processing_name].iloc[tweets_pair[1]])

		 BOW_TOKEN 		
number =  1
thanks happy
thanks b happy
number =  2
waited unhappy
waited u unhappy
number =  3
hi ashish tried call number got response unhappy please share another suitable time alternate cont
hi tried call number got response unhappy please share another suitable time alternate number cont
number =  4
hi tried call number got response unhappy please share another suitable time alternate number us cont
hi tried call number got response unhappy please share another suitable time alternate number cont
number =  5
hey thanks top new followers week much appreciated happy want
hey thanks top new followers week much appreciated happy
number =  6
thanks top engaged community members week happy want
thanks top engaged community members week happy want free
number =  7
share love thanks top new followers week happy
share love thanks top new followers week happy want
number =  8
thanks recent follow happy connect happy great thursday want
thanks recent follow happy connect happ

number =  1
hi try call number get response unhappy please share another suitable time alternate number u cont
hi try call number get response unhappy please share another suitable time alternate number cont
number =  2
wait unhappy
wait u unhappy
number =  3
thanks happy
thanks b happy
number =  4
hey thanks top new follower week much appreciate happy want
hey thanks top new follower week much appreciate happy
number =  5
thanks recent follow happy connect happy great wednesday
thanks recent follow happy connect happy great wednesday want
number =  6
thanks recent follow happy connect happy great thursday want
thanks recent follow happy connect happy great thursday
number =  7
share love thanks top new follower week happy
share love thanks top new follower week happy want
number =  8
thanks recent follow much appreciate happy
thanks recent follow much appreciate happy get
number =  9
thanks recent follow happy connect happy great thursday want free
thanks recent follow happy connect h

number =  1
hi try call number get response unhappy please share another suitable time alternate number u count
hi try call number get response unhappy please share another suitable time alternate number count
number =  2
thanks happy
thanks b happy
number =  3
wait unhappy
wait u unhappy
number =  4
thanks recent follow happy connect happy great thursday want
thanks recent follow happy connect happy great thursday want free
number =  5
thanks recent follow happy connect happy great wednesday
thanks recent follow happy connect happy great wednesday want
number =  6
thanks recent follow happy connect happy great thursday want
thanks recent follow happy connect happy great thursday
number =  7
hey thanks top new follower week much appreciate happy want
hey thanks top new follower week much appreciate happy
number =  8
thanks top engage community member week happy want
thanks top engage community member week happy want free
number =  9
share love thanks top new follower week happy
share l

## Machine Learning

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

## Choosing the best models

In [12]:
models = [ 
    ('LR', LogisticRegression()), 
    ('NB', MultinomialNB()), 
    ('DT', DecisionTreeClassifier()),
    ('RF', RandomForestClassifier())]

for i in vectorized_dict.keys():
    print(f'Vectorization type and preprocessing: {i}') 
    X_train = vectorized_dict[i][0]
    X_test = vectorized_dict[i][1]

    for name, model in models:
        clf = model
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(f'Model: {name}, ', end=' ')
        print("Test Accuracy:", accuracy_score(y_test, y_pred))
    print()

Vectorization type and preprocessing: BOW_token
Model: LR,  Test Accuracy: 0.9150943396226415
Model: NB,  Test Accuracy: 0.8943396226415095
Model: DT,  Test Accuracy: 0.9
Model: RF,  Test Accuracy: 0.9245283018867925

Vectorization type and preprocessing: WORDCOUNT_token
Model: LR,  Test Accuracy: 0.9169811320754717
Model: NB,  Test Accuracy: 0.8981132075471698
Model: DT,  Test Accuracy: 0.9018867924528302
Model: RF,  Test Accuracy: 0.9283018867924528

Vectorization type and preprocessing: TFIDF_token
Model: LR,  Test Accuracy: 0.9245283018867925
Model: NB,  Test Accuracy: 0.8924528301886793
Model: DT,  Test Accuracy: 0.9056603773584906
Model: RF,  Test Accuracy: 0.9207547169811321

Vectorization type and preprocessing: BOW_stem
Model: LR,  Test Accuracy: 0.9169811320754717
Model: NB,  Test Accuracy: 0.9075471698113208
Model: DT,  Test Accuracy: 0.9075471698113208
Model: RF,  Test Accuracy: 0.9226415094339623

Vectorization type and preprocessing: WORDCOUNT_stem
Model: LR,  Test Accura

## Grid search LogisticRegression

In [13]:
for i in vectorized_dict.keys():
    print(i)
    X_train = vectorized_dict[i][0]
    X_test = vectorized_dict[i][1]
    
    grid = [{'solver' : ['lbfgs','newton-cg','liblinear'],
            'max_iter' : [100, 1000, 2500, 5000] 
            }]

    logreg = LogisticRegression()
    logreg_cv = GridSearchCV(logreg, grid, cv=10)
    logreg_cv.fit(X_train, y_train)

    print("tuned hyperparameters : (best parameters) ", logreg_cv.best_params_)
    print("accuracy :", logreg_cv.best_score_)

BOW_token
tuned hyperparameters : (best parameters)  {'max_iter': 100, 'solver': 'lbfgs'}
accuracy : 0.9216310471251005
WORDCOUNT_token
tuned hyperparameters : (best parameters)  {'max_iter': 100, 'solver': 'lbfgs'}
accuracy : 0.9206876508986854
TFIDF_token
tuned hyperparameters : (best parameters)  {'max_iter': 100, 'solver': 'lbfgs'}
accuracy : 0.9117142090673344
BOW_stem
tuned hyperparameters : (best parameters)  {'max_iter': 100, 'solver': 'lbfgs'}
accuracy : 0.9206809442904408
WORDCOUNT_stem
tuned hyperparameters : (best parameters)  {'max_iter': 100, 'solver': 'lbfgs'}
accuracy : 0.9187986229097737
TFIDF_stem
tuned hyperparameters : (best parameters)  {'max_iter': 100, 'solver': 'lbfgs'}
accuracy : 0.9112425109541269
BOW_lemma
tuned hyperparameters : (best parameters)  {'max_iter': 100, 'solver': 'lbfgs'}
accuracy : 0.9216310471251005
WORDCOUNT_lemma
tuned hyperparameters : (best parameters)  {'max_iter': 100, 'solver': 'lbfgs'}
accuracy : 0.9192725565590628
TFIDF_lemma
tuned hyp

In [14]:
accuracy_final_logreg = []
for i in vectorized_dict.keys():

    X_train = vectorized_dict[i][0]
    X_test = vectorized_dict[i][1]

    clf = LogisticRegression(solver = 'lbfgs', max_iter = 100)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    
    accuracy_final_logreg.append(score)

In [15]:
processing = ['just tokenization', 
                'Stemming', 
                'Lemmatization', 
                'Misspellings', 
                'Stemming+misspellings', 
                'Lemmatization+misspellings']

accuracy_final_logreg = np.array(accuracy_final_logreg).reshape([6, 3])
accuracy_final_logreg = pd.DataFrame(accuracy_final_logreg, columns=['0 or 1, if the word exists', 'word counts', 'TFIDF'])
accuracy_final_logreg.insert(0, 'processing', processing, allow_duplicates = False)
accuracy_final_logreg.set_index('processing')

Unnamed: 0_level_0,"0 or 1, if the word exists",word counts,TFIDF
processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
just tokenization,0.915094,0.916981,0.924528
Stemming,0.916981,0.920755,0.926415
Lemmatization,0.916981,0.915094,0.916981
Misspellings,0.920755,0.920755,0.926415
Stemming+misspellings,0.924528,0.926415,0.928302
Lemmatization+misspellings,0.920755,0.918868,0.926415


## Grid search RandomForest

In [16]:
for i in vectorized_dict.keys():
    print(i)
    X_train = vectorized_dict[i][0]
    X_test = vectorized_dict[i][1]
    
    grid = {'n_estimators': [5, 10, 50, 100]}
    
    model = RandomForestClassifier(random_state=21)
    grid_search = GridSearchCV(model, grid, cv=10)
    grid_search.fit(X_train, y_train)

    print("tuned hyperparameters :(best parameters) ", grid_search.best_params_)
    print("accuracy :", grid_search.best_score_)
    print()

BOW_token
tuned hyperparameters :(best parameters)  {'n_estimators': 100}
accuracy : 0.9192725565590628

WORDCOUNT_token
tuned hyperparameters :(best parameters)  {'n_estimators': 100}
accuracy : 0.9178574622194402

TFIDF_token
tuned hyperparameters :(best parameters)  {'n_estimators': 100}
accuracy : 0.9192747920951444

BOW_stem
tuned hyperparameters :(best parameters)  {'n_estimators': 50}
accuracy : 0.9140816417776982

WORDCOUNT_stem
tuned hyperparameters :(best parameters)  {'n_estimators': 100}
accuracy : 0.9131315389430386

TFIDF_stem
tuned hyperparameters :(best parameters)  {'n_estimators': 100}
accuracy : 0.9154922650451576

BOW_lemma
tuned hyperparameters :(best parameters)  {'n_estimators': 100}
accuracy : 0.9140816417776982

WORDCOUNT_lemma
tuned hyperparameters :(best parameters)  {'n_estimators': 50}
accuracy : 0.9136121792005722

TFIDF_lemma
tuned hyperparameters :(best parameters)  {'n_estimators': 100}
accuracy : 0.9188053295180183

BOW_misspellings
tuned hyperparamete

## RandomForest best params

In [17]:
accuracy_final = []

for i in vectorized_dict.keys():

    X_train = vectorized_dict[i][0]
    X_test = vectorized_dict[i][1]

    clf = RandomForestClassifier(n_estimators = 100)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    
    accuracy_final.append(score)

In [18]:
processing = ['just tokenization', 
                'Stemming', 
                'Lemmatization', 
                'Misspellings', 
                'Stemming+misspellings', 
                'Lemmatization+misspellings']

result = np.array(accuracy_final).reshape([6, 3])
result = pd.DataFrame(result, columns=['0 or 1, if the word exists', 'word counts', 'TFIDF'])
result.insert(0, 'processing', processing, allow_duplicates = False)
result.set_index('processing')

Unnamed: 0_level_0,"0 or 1, if the word exists",word counts,TFIDF
processing,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
just tokenization,0.922642,0.924528,0.920755
Stemming,0.930189,0.922642,0.920755
Lemmatization,0.924528,0.924528,0.924528
Misspellings,0.913208,0.922642,0.926415
Stemming+misspellings,0.928302,0.932075,0.922642
Lemmatization+misspellings,0.916981,0.922642,0.924528


## Word2Vec

In [19]:
from gensim.models import Word2Vec

size = 20
window = 3
min_count = 15
workers = 3
sg = 1
batch_words = 50
X_train_tokens = X_train_all['lemma'].apply(tokenization)
X_test_tokens = X_test_all['lemma'].apply(tokenization)
X_tokens = X['lemma'].apply(tokenization)
tokens = X_tokens.values
w2v_model = Word2Vec(tokens, batch_words = batch_words, min_count = min_count, vector_size = size, workers = workers, window = window, sg = sg)
print("Similar to happy_face_or_smiley:")
print(w2v_model.wv.most_similar('happy_face_or_smiley'))

def mean_vectors(tokens):
    vec = np.zeros(size)
    count = 0
    for word in tokens:
        try:
            vec += w2v_model.wv[word]
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

train_word2vec_matrix = np.zeros(([len(X_train_tokens), size]))
for i in range(len(X_train_tokens)):
    train_word2vec_matrix[i, :] = mean_vectors(X_train_tokens.iloc[i])
X_train_word2vec = pd.DataFrame(train_word2vec_matrix)

test_word2vec_matrix = np.zeros(([len(X_test_tokens), size]))
for i in range(len(X_test_tokens)):
    test_word2vec_matrix[i, :] = mean_vectors(X_test_tokens.iloc[i])
X_test_word2vec = pd.DataFrame(test_word2vec_matrix)
print("\nShapes of Train and Test w2v datasets:")
print(X_train_word2vec.shape, X_test_word2vec.shape)

Similar to happy_face_or_smiley:
[('give', 0.9925644993782043), ('get', 0.9920628666877747), ('make', 0.9917730689048767), ('ill', 0.9912694096565247), ('people', 0.9909273386001587), ('week', 0.9906089305877686), ('sad', 0.9903519749641418), ('top', 0.9902238249778748), ('read', 0.9899414777755737), ('election', 0.9897988438606262)]

Shapes of Train and Test w2v datasets:
(2118, 20) (530, 20)


In [20]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train_word2vec, y_train)

y_pred = clf.predict(X_test_word2vec)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.809433962264151
