# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import random
import string
import re

# NLTK libraries
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

# Sci-kit libraries
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Data Loading

In [2]:
data = pd.read_csv("./Dataset/training.1600000.processed.noemoticon.csv")
data = data.sample(n=20000) # using random sample of the actual data

In [3]:
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
56962,0,1685546194,Sun May 03 00:53:07 PDT 2009,NO_QUERY,LanieSays,pretty exhausted but can't fall asleep. I ha...
555070,0,2203987444,Wed Jun 17 01:02:01 PDT 2009,NO_QUERY,poojalapasia,is finding Opera Unite very tempting but compl...
1194881,4,1984531724,Sun May 31 15:17:28 PDT 2009,NO_QUERY,PinkHeart27,on my way to see you Babe!!!!! LYM
568120,0,2207421605,Wed Jun 17 07:55:02 PDT 2009,NO_QUERY,iLenexLeNi,@Shishu95: HELP ME!!!!!! AGHHH. no freaking in...
854755,4,1573442612,Tue Apr 21 01:11:28 PDT 2009,NO_QUERY,alaverdyan,wondering why in the world did Suzanne and Jam...


In [4]:
data.shape

(20000, 6)

In [5]:
data.columns = ['target','ids','Date','flag','user','text']

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 56962 to 1333505
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  20000 non-null  int64 
 1   ids     20000 non-null  int64 
 2   Date    20000 non-null  object
 3   flag    20000 non-null  object
 4   user    20000 non-null  object
 5   text    20000 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.1+ MB


In [7]:
# Dropping unnessary features

data.drop(['ids','Date','flag','user'],axis=1,inplace = True)

In [8]:
data.target.value_counts()

0    10028
4     9972
Name: target, dtype: int64

## Data Cleaning

In [9]:
punctuations = string.punctuation

In [10]:
stop = stopwords.words('english')

In [11]:
# appending punctuations in stopwords

punctuations = [char for char in punctuations]
for char in punctuations:
    stop.append(char)

In [12]:
tokenizer = RegexpTokenizer(r'\w+') # only aplhabets 
ps = PorterStemmer()

In [13]:
def cleanWords(text):
    
    # lower the text message
    text = text.lower()
    
    # remove links
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',text)
    
     # remove usernames
    text = re.sub('@[^\s]+','',text) 
    
     # remove additional whitespaces
    text = re.sub('[\s]+', ' ', text)
    
    # Regex tokenizer
    text = tokenizer.tokenize(text)
    
    # Stopwords removal and Stemming using porter stemmer
    meaningful = [ps.stem(word) for word in text if not word in stop]
        

    return ' '.join(meaningful)

In [14]:
key = data['text'].keys()

In [15]:
# Cleaning all texts in dataFrame

for i in key:
    data['text'][i] = cleanWords(data['text'][i])
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'][i] = cleanWords(data['text'][i])


In [16]:
data.head(7)

Unnamed: 0,target,text
56962,0,pretti exhaust fall asleep dine enemi wolf she...
555070,0,find opera unit tempt complic
1194881,4,way see babe lym
568120,0,help aghhh freak internet whole 10 day mann im...
854755,4,wonder world suzann jame robertson write quot ...
1176703,4,standbi tre blog link blast sunday read pleasur
135256,0,ugh work comput serious broken get program use...


##  Data Splitting

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train , X_test , Y_train, Y_test = train_test_split(data, data['target'],test_size=0.2,random_state=0)

In [19]:
X_train.shape , X_test.shape

((16000, 2), (4000, 2))

In [20]:
X_train.head(5), Y_train.head(4)

(         target                                               text
 556468        0                                also repli bad hear
 329592        0  made dinner everyon went ikea eat iron made sw...
 1365925       4                           yeah down hour bit c goe
 221031        0                               sad see church later
 896574        4  sicker yesterday scream voic think go doctor s...,
 556468     0
 329592     0
 1365925    4
 221031     0
 Name: target, dtype: int64)

## Creating vocab and data formatting

In [22]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer

In [23]:
tdidf = TfidfVectorizer(analyzer='word', max_features=2000, max_df = 0.8, ngram_range=(1,1))
X_train_vectorized = tdidf.fit_transform(X_train.text)
X_test_vectorized = tdidf.transform(X_test.text)

In [24]:
X_train_vectorized.shape, X_test_vectorized.shape

((16000, 2000), (4000, 2000))

# Model Selection

### <font color= 'red'>Logistic Regression </font>

In [25]:
logreg = LogisticRegression(C = 2.1, solver='liblinear', multi_class='auto')
logreg.fit(X_train_vectorized, Y_train)
Y_pred_lr = logreg.predict(X_test_vectorized)

cf_lr = classification_report(Y_pred_lr,Y_test)
score_lr = accuracy_score(Y_pred_lr,Y_test)

print(cf_lr)
print("Accuracy " ,score_lr)

              precision    recall  f1-score   support

           0       0.73      0.72      0.73      1996
           4       0.73      0.73      0.73      2004

    accuracy                           0.73      4000
   macro avg       0.73      0.73      0.73      4000
weighted avg       0.73      0.73      0.73      4000

Accuracy  0.7275


### <font color= 'red'>SVC </font>

In [26]:
svc = SVC()
svc.fit(X_train_vectorized, Y_train)
Y_pred_svc = svc.predict(X_test_vectorized)

cf_svc = classification_report(Y_pred_svc,Y_test)
score_svc = accuracy_score(Y_pred_svc,Y_test)
print(cf_svc)
print("Accuracy " , score_svc)

              precision    recall  f1-score   support

           0       0.72      0.72      0.72      1980
           4       0.72      0.72      0.72      2020

    accuracy                           0.72      4000
   macro avg       0.72      0.72      0.72      4000
weighted avg       0.72      0.72      0.72      4000

Accuracy  0.7215


### <font color= 'red'>Random Forest Classifier </font>

In [27]:
rf = RandomForestClassifier()
rf.fit(X_train_vectorized, Y_train)
Y_pred_rf = rf.predict(X_test_vectorized)

cf_rf = classification_report(Y_pred_rf,Y_test)
score_rf = accuracy_score(Y_pred_rf,Y_test)

print(cf_rf)
print("Accuracy " ,score_rf)

              precision    recall  f1-score   support

           0       0.69      0.71      0.70      1944
           4       0.72      0.71      0.71      2056

    accuracy                           0.71      4000
   macro avg       0.71      0.71      0.71      4000
weighted avg       0.71      0.71      0.71      4000

Accuracy  0.707


### <font color= 'red'>Decision Tree Classifier </font>

In [28]:
dt = DecisionTreeClassifier()
dt.fit(X_train_vectorized, Y_train)
Y_pred_dt = dt.predict(X_test_vectorized)

cf_dt = classification_report(Y_pred_dt,Y_test)
score_dt = accuracy_score(Y_pred_dt,Y_test)
print(cf_dt)
print("Accuracy " ,score_dt)

              precision    recall  f1-score   support

           0       0.67      0.66      0.66      2005
           4       0.66      0.67      0.67      1995

    accuracy                           0.67      4000
   macro avg       0.67      0.67      0.67      4000
weighted avg       0.67      0.67      0.67      4000

Accuracy  0.66575
