In [1]:
# importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

In [2]:
# Importing the dataset
dataset = pd.read_csv('train_E6oV3lV.csv')
dataset.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
# browsing through the tweets

dataset.head().tweet.values

array([' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
       "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
       '  bihday your majesty',
       '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
       ' factsguide: society now    #motivation'], dtype=object)

The tweeets contain a lot of symbols (@#\ etc) which do no provide us with any new information. Also it becomes difficult for the machine to distinguish words such as _dysfunctional_ and _dysfunction_ which have essentially the same meaning. So we need to clean the text to make it standard and recognizable to machines

In [4]:
# Cleaning the texts
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []

In [5]:
for i in range(len(dataset)):
    review = re.sub('[^a-zA-Z]', ' ', dataset['tweet'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
# reviewing the new corpus of words formed
corpus[:5]

['user father dysfunct selfish drag kid dysfunct run',
 'user user thank lyft credit use caus offer wheelchair van pdx disapoint getthank',
 'bihday majesti',
 'model love u take u time ur',
 'factsguid societi motiv']

Thus we have a corpus of stemmed words, which can now be used to make predictions on the dataset

Now, I will try multiple models and use the best of them to make predictions

# Bag of words model

In [7]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000) 
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
X.shape

(31962, 5000)

In [8]:
# Splitting the dataset into the Training set and Test set
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [9]:
def getscores(classifier):
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)

    print('confusion matrix = ',cm)
    print('f1 score = ' , f1_score(y_test,y_pred))
    return 0

In [10]:
# Naive Bayes classifier

from sklearn.naive_bayes import GaussianNB
classifierNB = GaussianNB()
classifierNB.fit(X_train, y_train)

getscores(classifierNB)

confusion matrix =  [[4503 1482]
 [ 121  287]]
f1 score =  0.2636655948553055


In [11]:
# logistic Regression

from sklearn.linear_model import LogisticRegression
classifierLR = LogisticRegression(random_state = 0)
classifierLR.fit(X_train, y_train)

getscores(classifierLR)



confusion matrix =  [[5931   54]
 [ 205  203]]
f1 score =  0.6105263157894737


A higher F1 score corresponds to a better model, hence we want the F1 score of our model closest to 1.

We can also try and use the ***textblob*** library which provides some functions to make predictions on sentences and a built-in function for sentiment analysis

In [12]:
from textblob import TextBlob

In [13]:
dataset['sentiment'] = dataset['tweet'].apply(lambda x: TextBlob(x).sentiment[0] )

In [14]:
dataset['senti'] = dataset['sentiment'].apply(lambda x: 0 if x>0 else 1)
dataset.head(20)

Unnamed: 0,id,label,tweet,sentiment,senti
0,1,0,@user when a father is dysfunctional and is s...,-0.5,1
1,2,0,@user @user thanks for #lyft credit i can't us...,0.2,0
2,3,0,bihday your majesty,0.0,1
3,4,0,#model i love u take with u all the time in ...,0.976562,0
4,5,0,factsguide: society now #motivation,0.0,1
5,6,0,[2/2] huge fan fare and big talking before the...,0.2,0
6,7,0,@user camping tomorrow @user @user @user @use...,0.0,1
7,8,0,the next school year is the year for exams.ð...,-0.4,1
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,0.976562,0
9,10,0,@user @user welcome here ! i'm it's so #gr...,1.0,0


In [15]:
print('confusion matrix = ',confusion_matrix(dataset['senti'], dataset['label']))
print('f1 score = ' , f1_score(dataset['senti'], dataset['label']))

confusion matrix =  [[15279   602]
 [14441  1640]]
f1 score =  0.17900998744747038


As we can see the score is pretty low so we'll not go with this approach