# Text classification

In [1]:
import pandas as pd

In [2]:
train  = pd.read_csv('train.csv',encoding='iso-8859-1')

In [3]:
train = train.drop('ItemID',axis=1)
train.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [4]:
def clean_tweet(x):
    from bs4 import BeautifulSoup
    import re
    clean = BeautifulSoup(x, 'lxml')
    x = clean.get_text()#html decoding
    x = re.sub(r'@[A-Za-z0-9]+','',x)#remove @mention
    x = re.sub('https?://[A-Za-z0-9./]+','',x)#remove html links
    x = re.sub("[^a-zA-Z]", " ", x)#remove punctuations/numbers
    return x

In [5]:
train_clean = pd.DataFrame()
train_clean['text'] = train['SentimentText'].apply(clean_tweet)
train_clean['Sentiment'] = train['Sentiment']

In [6]:
train_clean.head()

Unnamed: 0,text,Sentiment
0,is so sad for my APL friend,0
1,I missed the New Moon trailer,0
2,omg its already O,1
3,Omgaga Im sooo im gunna CRy I ve been at...,0
4,i think mi bf is cheating on me T T,0


In [7]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
train_clean['text'] = train_clean['text'].apply(lambda x:" ".join([wnl.lemmatize(i) for i in x.split(' ')]))

In [8]:
#converting to lower case
train_clean['text'] = train_clean['text'].apply(lambda x:" ".join([i.lower() for i in x.split(' ')]))

In [9]:
train_clean.head()

Unnamed: 0,text,Sentiment
0,is so sad for my apl friend,0
1,i missed the new moon trailer,0
2,omg it already o,1
3,omgaga im sooo im gunna cry i ve been at...,0
4,i think mi bf is cheating on me t t,0


### TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train_clean['text'])

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_vect,train_clean['Sentiment'], test_size=0.3, random_state=42)

In [10]:
from sklearn.svm import LinearSVC
model1 = LinearSVC()
model1.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [11]:
from sklearn.metrics import accuracy_score
accuracy_score(model1.predict(X_train),y_train)

0.7384843982169391

In [12]:
accuracy_score(model1.predict(X_test),y_test)

0.727972797279728

In [13]:
from sklearn.naive_bayes import MultinomialNB
model2 = MultinomialNB()
model2.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
accuracy_score(model2.predict(X_train),y_train)

0.7309121042404846

In [15]:
accuracy_score(model2.predict(X_test),y_test)

0.7236056939027236

In [16]:
from sklearn.linear_model import LogisticRegression
model3 = LogisticRegression()
model3.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
accuracy_score(model3.predict(X_train),y_train)

0.738441536175563

In [18]:
accuracy_score(model3.predict(X_test),y_test)

0.7276727672767277

### CountVectorizer

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train_clean['text'])

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_bow,train_clean['Sentiment'], test_size=0.3, random_state=42)

In [22]:
from sklearn.svm import LinearSVC
model1 = LinearSVC()
model1.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [23]:
from sklearn.metrics import accuracy_score
accuracy_score(model1.predict(X_train),y_train)

0.7584866841924791

In [24]:
accuracy_score(model1.predict(X_test),y_test)

0.7456078941227456

In [25]:
from sklearn.naive_bayes import MultinomialNB
model2 = MultinomialNB()
model2.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
accuracy_score(model2.predict(X_train),y_train)

0.7516001828780432

In [27]:
accuracy_score(model2.predict(X_test),y_test)

0.7424742474247424

In [28]:
from sklearn.linear_model import LogisticRegression
model3 = LogisticRegression()
model3.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [29]:
accuracy_score(model3.predict(X_train),y_train)

0.7594582237970053

In [30]:
accuracy_score(model3.predict(X_test),y_test)

0.748041470813748

### Word2Vec

In [11]:
#loading glove model
import numpy as np
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print ("Done",len(model)," words loaded!")
    return model

In [12]:
filename = 'glove.twitter.27B.100d.txt.word2vec'
features = loadGloveModel(filename)

Loading Glove Model
Done 1193515  words loaded!


In [56]:
def text2vec(sen):
    vec = [0]*100
    for i in sen.split():
        if i in features.keys():
            vec += features[i]
    global avg_vec
    if len(sen.split())!=0:
        avg_vec = np.array(vec)/len(sen.split())
    return avg_vec   

In [60]:
X = np.array([text2vec(i) for i in train_clean['text']])
y = np.array(train_clean['Sentiment'])

In [61]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [62]:
from sklearn.svm import LinearSVC
model1 = LinearSVC()
model1.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [66]:
from sklearn.metrics import accuracy_score
accuracy_score(model1.predict(X_train),y_train)

0.7326551605897816

In [67]:
accuracy_score(model1.predict(X_test),y_test)

0.7327732773277328

In [69]:
from sklearn.linear_model import LogisticRegression
model2 = LogisticRegression()
model2.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [70]:
accuracy_score(model2.predict(X_train),y_train)

0.732483712424277

In [71]:
accuracy_score(model2.predict(X_test),y_test)

0.7332066539987332