# Importing libraries

In [None]:
import numpy as np
import pandas as pd
import random

# Data Loading

In [2]:
data = pd.read_csv("./Dataset/training.1600000.processed.noemoticon.csv")
data = data.sample(n=10000)

In [3]:
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
263917,0,1986693919,Sun May 31 19:11:17 PDT 2009,NO_QUERY,HAZZARD202,i need updates on the awards
917313,4,1753467101,Sun May 10 01:47:03 PDT 2009,NO_QUERY,celinehlbk,@ShanteRowlandd so how is the most wonderful m...
1297366,4,2004489309,Tue Jun 02 08:16:50 PDT 2009,NO_QUERY,hyper_baffoon,i think im addicted to these traffic light ice...
1103708,4,1971075046,Sat May 30 06:08:24 PDT 2009,NO_QUERY,bengarrett77,@dvsboy you know foyles has a decent bookshop ...
908229,4,1696133692,Mon May 04 07:40:45 PDT 2009,NO_QUERY,JayeMarie,@sheila_H ok so who do you know on Daisy of Lo...


In [4]:
data.shape

(10000, 6)

In [5]:
data.columns = ['target','ids','Date','flag','user','text']

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 263917 to 936728
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  10000 non-null  int64 
 1   ids     10000 non-null  int64 
 2   Date    10000 non-null  object
 3   flag    10000 non-null  object
 4   user    10000 non-null  object
 5   text    10000 non-null  object
dtypes: int64(2), object(4)
memory usage: 546.9+ KB


In [7]:
# Dropping unnessary features

data.drop(['ids','Date','flag','user'],axis=1,inplace = True)

In [8]:
data.target.value_counts()

4    5111
0    4889
Name: target, dtype: int64

## Data Cleaning

In [9]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

In [10]:
punctuations = string.punctuation

In [11]:
stop = stopwords.words('english')

In [12]:
# appending punctuations in stopwords
punctuations = [char for char in punctuations]
for char in punctuations:
    stop.append(char)

In [13]:
tokenizer = RegexpTokenizer(r'\w+')
ps = PorterStemmer()

In [14]:
def cleanWords(text):
    
    # lower the text message
    text = text.lower()
    
    # Regex tokenizer
    text = tokenizer.tokenize(text)
    
    # Stopwords removal and Stemming using porter stemmer
    meaningful = [ps.stem(word) for word in text if not word in stop]
        

    return ' '.join(meaningful)

In [15]:
key = data['text'].keys()

In [16]:
for i in key:
    data['text'][i] = cleanWords(data['text'][i])
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'][i] = cleanWords(data['text'][i])


In [17]:
data.head(7)

Unnamed: 0,target,text
263917,0,need updat award
917313,4,shanterowlandd wonder mommi world
1297366,4,think im addict traffic light ice lolli sugar ...
1103708,4,dvsboy know foyl decent bookshop westfield lon...
908229,4,sheila_h ok know daisi love seen yet oh lord
1317724,4,love know dont worri get
1194754,4,cuppi time spare play darkfal person


#  Vectorizer

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
cv = CountVectorizer(max_features=2000)

In [20]:
data_vectorized = cv.fit_transform(data['text']).toarray()

In [21]:
data_vectorized

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

##  Data Splitting

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train , X_test , Y_train, Y_test = train_test_split(data_vectorized, data['target'],test_size=0.2,random_state=0)

In [24]:
X_train.shape , X_test.shape

((8000, 2000), (2000, 2000))

In [25]:
X_train, Y_train

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 922194     4
 585073     0
 372483     0
 998024     4
 224565     0
           ..
 1108896    4
 468491     0
 897351     4
 700235     0
 1517332    4
 Name: target, Length: 8000, dtype: int64)

# Model Selection

In [29]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [30]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
svc = SVC()
svc.fit(X_train,Y_train)
Y_pred_svc = svc.predict(X_test_vectorized)
cf_svc = classification_report(Y_pred_svc,Y_test)
score_svc = accuracy_score(Y_pred_svc,Y_test)
print(cf_svc)
print(score_svc)

In [None]:
rf = RandomForestClassifier(n_estimators = 10)
rf.fit(X_train,Y_train)
Y_pred_rf = rf.predict(X_test_vectorized)
cf_rf = classification_report(Y_pred_rf,Y_test)
score_rf = accuracy_score(Y_pred_rf,Y_test)
print(cf_rf)
print(score_rf)

In [None]:
dt = DecisionTreeClassifier( criterion='entropy')
dt.fit(X_train,Y_train)
Y_pred_dt = dt.predict(X_test_vectorized)
cf_dt = classification_report(Y_pred_dt,Y_test)
score_dt = accuracy_score(Y_pred_dt,Y_test)
print(cf_dt)
print(score_dt)