In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [16]:
data = pd.read_csv('../train.csv')
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [17]:
data = data.drop(['id'], axis = 1)
data.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [3]:
# remove unwanted text patterns from the tweets
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    return input_txt
data['Tweets'] = np.vectorize(remove_pattern)(data['tweet'], "@[\w]*")
data['Tweets'] = data['Tweets'].str.replace("[^a-zA-Z#]", " ")

# Removing Short Words
data['Tweets'] = data['Tweets'].apply(lambda x : ' '.join([w for w in x.split() if len(w) > 3]))
tokenized_tweet = data['Tweets'].apply(lambda x : x.split())
from nltk.stem.porter import * 
stemmer = PorterStemmer() 
tokenized_tweet = tokenized_tweet.apply(lambda x : [stemmer.stem(i) for i in x])
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
data['Tweets'] = tokenized_tweet
data.head()

Unnamed: 0,label,tweet,Tweets
0,0,@user when a father is dysfunctional and is s...,when father dysfunct selfish drag kid into dys...
1,0,@user @user thanks for #lyft credit i can't us...,thank #lyft credit caus they offer wheelchair ...
2,0,bihday your majesty,bihday your majesti
3,0,#model i love u take with u all the time in ...,#model love take with time
4,0,factsguide: society now #motivation,factsguid societi #motiv


In [4]:
from sklearn.utils import resample
df_majority = data[data.label == 0]
df_minority = data[data.label == 1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace = True,     # sample with replacement
                                 n_samples = len(df_majority),
                                 random_state = 123)
 
# Combine majority class with upsampled minority class
data = pd.concat([df_majority, df_minority_upsampled])
data.label.value_counts()

0    29720
1    29720
Name: label, dtype: int64

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# using countvectorizer

In [6]:
# using CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = 'english')
x_train, x_test, y_train, y_test = train_test_split(data['Tweets'], data['label'], test_size = 0.3, random_state = 42)

train = cv.fit_transform(x_train)  #train the vectorizer, build the vocablury
test = cv.transform(x_test)  #get same encodings on test data as of vocabulary built

In [7]:
# Random Forest
rf = RandomForestClassifier()
rf.fit(train, y_train)
predicted = rf.predict(test)
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

[[8695  191]
 [  18 8928]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      8886
           1       0.98      1.00      0.99      8946

    accuracy                           0.99     17832
   macro avg       0.99      0.99      0.99     17832
weighted avg       0.99      0.99      0.99     17832



In [8]:
# Decision Tree
dt = DecisionTreeClassifier()
dt.fit(train, y_train)
predicted = dt.predict(test)
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

[[8402  484]
 [  18 8928]]
              precision    recall  f1-score   support

           0       1.00      0.95      0.97      8886
           1       0.95      1.00      0.97      8946

    accuracy                           0.97     17832
   macro avg       0.97      0.97      0.97     17832
weighted avg       0.97      0.97      0.97     17832



In [9]:
# KNN
kn = KNeighborsClassifier()
kn.fit(train, y_train)
predicted = kn.predict(test)
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

[[8183  703]
 [  20 8926]]
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      8886
           1       0.93      1.00      0.96      8946

    accuracy                           0.96     17832
   macro avg       0.96      0.96      0.96     17832
weighted avg       0.96      0.96      0.96     17832



In [10]:
# Logistic Regression
lg = LogisticRegression()
lg.fit(train, y_train)
predicted = lg.predict(test)
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

[[8526  360]
 [ 115 8831]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      8886
           1       0.96      0.99      0.97      8946

    accuracy                           0.97     17832
   macro avg       0.97      0.97      0.97     17832
weighted avg       0.97      0.97      0.97     17832



# using TfidfVectorizer

In [11]:
# using TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(stop_words = 'english')
train1 = tf.fit_transform(x_train)  #train the vectorizer, build the vocablury
test1 = tf.transform(x_test)  #get same encodings on test data as of vocabulary built

In [12]:
# Random Forest
rf1 = RandomForestClassifier()
rf1.fit(train1, y_train)
predicted1 = rf1.predict(test1)
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

[[8526  360]
 [ 115 8831]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      8886
           1       0.96      0.99      0.97      8946

    accuracy                           0.97     17832
   macro avg       0.97      0.97      0.97     17832
weighted avg       0.97      0.97      0.97     17832



In [13]:
# Decision Tree
dt1 = DecisionTreeClassifier()
dt1.fit(train1, y_train)
predicted1 = dt1.predict(test1)
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

[[8526  360]
 [ 115 8831]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      8886
           1       0.96      0.99      0.97      8946

    accuracy                           0.97     17832
   macro avg       0.97      0.97      0.97     17832
weighted avg       0.97      0.97      0.97     17832



In [14]:
# KNN
kn1 = KNeighborsClassifier()
kn1.fit(train1, y_train)
predicted1 = kn1.predict(test1)
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

[[8526  360]
 [ 115 8831]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      8886
           1       0.96      0.99      0.97      8946

    accuracy                           0.97     17832
   macro avg       0.97      0.97      0.97     17832
weighted avg       0.97      0.97      0.97     17832



In [15]:
# Logistic Regression
lg1 = LogisticRegression()
lg1.fit(train1, y_train)
predicted1 = lg1.predict(test1)
print(confusion_matrix(y_test, predicted))
print(classification_report(y_test, predicted))

[[8526  360]
 [ 115 8831]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      8886
           1       0.96      0.99      0.97      8946

    accuracy                           0.97     17832
   macro avg       0.97      0.97      0.97     17832
weighted avg       0.97      0.97      0.97     17832

