In [None]:
~watermark

### Import Libraries

In [83]:
import pandas as pd
import re
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
stopwords = set((stopwords.words("english")))

#### Dataset

In [84]:
data = pd.read_csv("tweets_data.csv")

In [85]:
tweet = pd.read_csv("tweets_data.csv", usecols = ["label", "tweet"])
tweet.head(10)
#tweet.shape

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation
5,0,[2/2] huge fan fare and big talking before the...
6,0,@user camping tomorrow @user @user @user @use...
7,0,the next school year is the year for exams.ð...
8,0,we won!!! love the land!!! #allin #cavs #champ...
9,0,@user @user welcome here ! i'm it's so #gr...


In [86]:
# Create test and train sets

from sklearn.model_selection import train_test_split
tweet_train, tweet_Test, label_train, label_Test = train_test_split(tweet.tweet, tweet.label, test_size = 0.2)

In [87]:
tweet_train.shape, tweet_Test.shape

((25569,), (6393,))

### Text Preprocessing

In [88]:
def to_lower(word):
    result = word.lower()
    return result

def remove_hyperlink(word):
    return  re.sub(r"http\S+", "", word)

def remove_mentions(word):
    return re.sub(r"@\S+", "", word)

def remove_number(word):
    result = re.sub(r'\d+', '', word)
    return result

def remove_punctuation(word):
    result = re.sub('[^A-Za-z]+', ' ', word)
    return result

def remove_whitespace(word):
    result = word.strip()
    return result

def replace_newline(word):
    return word.replace('\n','')

def remove_stopwords(word):
    return ' '.join(word for word in i.split() if word not in stopwords)



def clean_up_pipeline(sentence):
    cleaning_data = [remove_hyperlink,
                      replace_newline,
                      to_lower,
                      remove_number,
                      remove_punctuation,
                      remove_whitespace]
    for func in cleaning_data:
        
        sentence = func(sentence)
    return sentence

print(tweet_train)
tweet_train = tweet_train.apply(clean_up_pipeline)

print(tweet_train)

21626    possible future:the year is 2100 and we advanc...
21373    #suicide #depression    i wonder if suicide is...
27243    why do you always try to make me happy?  i don...
23195    sometimes i'm just sad, soppy and gloopy, noth...
963      yesterday @ ozen ð  #restaurant   #eatallyo...
                               ...                        
14818    derventio is booked to play for a #wedding at ...
3433     #wisconsin   bull hill climb: you have to reac...
29841    @user our global media platform would very muc...
14609    christina grimmie of 'voice' fame shot dead af...
3624                        @user @user   bihday milesð
Name: tweet, Length: 25569, dtype: object
21626    possible future the year is and we advanced so...
21373    suicide depression i wonder if suicide is the ...
27243    why do you always try to make me happy i dont ...
23195    sometimes i m just sad soppy and gloopy nothin...
963                 yesterday ozen restaurant eatallyoucan
              

### Tokenization

In [89]:
from nltk.tokenize import word_tokenize
tweet_train = tweet_train.apply(word_tokenize)

In [90]:
tweet_train

21626    [possible, future, the, year, is, and, we, adv...
21373    [suicide, depression, i, wonder, if, suicide, ...
27243    [why, do, you, always, try, to, make, me, happ...
23195    [sometimes, i, m, just, sad, soppy, and, gloop...
963            [yesterday, ozen, restaurant, eatallyoucan]
                               ...                        
14818    [derventio, is, booked, to, play, for, a, wedd...
3433     [wisconsin, bull, hill, climb, you, have, to, ...
29841    [user, our, global, media, platform, would, ve...
14609    [christina, grimmie, of, voice, fame, shot, de...
3624                           [user, user, bihday, miles]
Name: tweet, Length: 25569, dtype: object

### Stemming

In [91]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text])
tweet_train = tweet_train.apply(lambda text: stem_words(text))

tweet_train

21626    possibl futur the year is and we advanc so far...
21373    suicid depress i wonder if suicid is the best ...
27243    whi do you alway tri to make me happi i dont k...
23195    sometim i m just sad soppi and gloopi noth goe...
963                    yesterday ozen restaur eatallyoucan
                               ...                        
14818    derventio is book to play for a wed at the lov...
3433     wisconsin bull hill climb you have to reach th...
29841    user our global media platform would veri much...
14609    christina grimmi of voic fame shot dead after ...
3624                                 user user bihday mile
Name: tweet, Length: 25569, dtype: object

### Cleaning of Raw Data

#### Lowercase

In [92]:
tweet.tweet = tweet.tweet.str.lower()

tweet

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation
...,...,...
31957,0,ate @user isz that youuu?ðððððð...
31958,0,to see nina turner on the airwaves trying to...
31959,0,listening to sad songs on a monday morning otw...
31960,1,"@user #sikh #temple vandalised in in #calgary,..."


#### Remove "@" Mentions

In [93]:
p = []

for i in tweet.tweet:
    p.append(re.sub(r"@\S+", "", i))
tweet.tweet = p


tweet

Unnamed: 0,label,tweet
0,0,when a father is dysfunctional and is so sel...
1,0,thanks for #lyft credit i can't use cause th...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation
...,...,...
31957,0,ate isz that youuu?ððððððð...
31958,0,to see nina turner on the airwaves trying to...
31959,0,listening to sad songs on a monday morning otw...
31960,1,"#sikh #temple vandalised in in #calgary, #wso..."


#### Remove URLs

In [94]:
p = []

for i in tweet.tweet:
    p.append(re.sub(r"http\S+", "", i))
tweet.tweet = p

tweet

Unnamed: 0,label,tweet
0,0,when a father is dysfunctional and is so sel...
1,0,thanks for #lyft credit i can't use cause th...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation
...,...,...
31957,0,ate isz that youuu?ððððððð...
31958,0,to see nina turner on the airwaves trying to...
31959,0,listening to sad songs on a monday morning otw...
31960,1,"#sikh #temple vandalised in in #calgary, #wso..."


#### Remove any special characters

In [95]:
p = []

for sentence in tweet.tweet:
    p.append(re.sub('[^A-Za-z]+', ' ', sentence))
tweet.tweet = p

tweet

Unnamed: 0,label,tweet
0,0,when a father is dysfunctional and is so self...
1,0,thanks for lyft credit i can t use cause they...
2,0,bihday your majesty
3,0,model i love u take with u all the time in ur
4,0,factsguide society now motivation
...,...,...
31957,0,ate isz that youuu
31958,0,to see nina turner on the airwaves trying to ...
31959,0,listening to sad songs on a monday morning otw...
31960,1,sikh temple vandalised in in calgary wso cond...


#### Removing stopwords

In [98]:
no_stopwords=[]
for i in tweet.tweet:
    no_stopwords.append(' '.join(word for word in i.split() if word not in stopwords))
tweet.tweet = no_stopwords
tweet

Unnamed: 0,label,tweet
0,0,father dysfunctional selfish drags kids dysfun...
1,0,thanks lyft credit use cause offer wheelchair ...
2,0,bihday majesty
3,0,model love u take u time ur
4,0,factsguide society motivation
...,...,...
31957,0,ate isz youuu
31958,0,see nina turner airwaves trying wrap mantle ge...
31959,0,listening sad songs monday morning otw work sad
31960,1,sikh temple vandalised calgary wso condemns act


### Word Embedding

In [99]:
import gensim

tokenize=tweet_train.apply(lambda x: x.split())
w2vec_model=gensim.models.Word2Vec(tokenize,min_count = 1, size = 100, window = 5, sg = 1)
w2vec_model.train(tokenize,total_examples= len(tweet_train),epochs=20)

(5335122, 6637140)

In [100]:
from sklearn.feature_extraction.text import CountVectorizer

row=CountVectorizer( min_df=2, max_features=1000)
row.fit(tweets_train)
row_df=row.transform(tweets_train).toarray()

In [101]:
row_df.shape

(25569, 1000)

## Model

In [102]:
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
import numpy as np
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

In [103]:
label_train = np.array(label_train)

In [104]:
lreg = LogisticRegression()
lreg.fit(row_df, np.array(label_train)) # training the model

prediction = lreg.predict_proba(row_df) # predicting on the validation set
prediction_int = prediction[:,1] >= 0.3 # if prediction is greater than or equal to 0.3 than 1 else 0
prediction_int = prediction_int.astype(np.int)

f1_score(label_train, prediction_int)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.03512506652474721

In [105]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

cf_matrix =  confusion_matrix(label_train, prediction_int)

tn, fp, fn, tp = confusion_matrix(label_train, prediction_int).ravel()

# Create the 
print("Precision: {:.2f}%".format(100 * precision_score(label_train, prediction_int)))
print("Recall: {:.2f}%".format(100 * recall_score(label_train, prediction_int)))
print("F1 Score: {:.2f}%".format(100 * f1_score(label_train,prediction_int)))


Precision: 45.21%
Recall: 1.83%
F1 Score: 3.51%


In [106]:
# Try several models

model_list = [LogisticRegression(), GaussianNB(), SVC(kernel = 'linear', probability = True)]

In [None]:
for model in model_list:
    model.fit(row_df, np.array(label_train))
    prediction = model.predict_proba(row_df) # predicting on the validation set
    prediction_int = prediction[:,1] >= 0.3 # if prediction is greater than or equal to 0.3 then 1, else 0
    prediction_int = prediction_int.astype(np.int)

    f1score=f1_score(label_train, prediction_int) 
    print(f1score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.03512506652474721
0.15368526349292955


#### Data Visualization

In [None]:
# Use a confusion matrix to visualize model performance

import seaborn as sns
import matplotlib.pyplot as plt

ax= plt.subplot()
#annot=True to annotate cells
sns.heatmap(cf_matrix, annot=True, ax = ax,cmap='Blues',fmt='');

# labels, title and ticks
ax.set_xlabel('Predicted labels');
ax.set_ylabel('True labels');
ax.set_title('Confusion Matrix');
ax.xaxis.set_ticklabels(['Positive', 'Negative']); ax.yaxis.set_ticklabels(['Positive', 'Negative']);