In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("train.tsv",sep="\t")
train.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [3]:
test = pd.read_csv("test.tsv",sep="\t")
test.head(5)

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [4]:
train.groupby("Sentiment").size()

Sentiment
0     7072
1    27273
2    79582
3    32927
4     9206
dtype: int64

In [5]:
import nltk
from nltk.stem import WordNetLemmatizer
chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
wnl = WordNetLemmatizer()
def stemming(sentence):
    alphaSent = ''.join([c for c in sentence if c in chars])
    words = nltk.word_tokenize(sentence)
    cleaned_words = []
    for word,tag in nltk.pos_tag(words):
        if tag.startswith('V'):
            cleaned_words.append(wnl.lemmatize(word, pos='v').lower())
        elif tag.startswith('J') or tag.startswith('R'):
            cleaned_words.append(wnl.lemmatize(word, pos='a').lower())
        elif tag.startswith('NN') or tag.startswith('NN'):
            cleaned_words.append(wnl.lemmatize(word, pos='n').lower())
        else:
            cleaned_words.append(word.lower())
    return " ".join(cleaned_words)

vectorizer = np.vectorize(stemming)

In [6]:
train['Phrase'] = vectorizer(train['Phrase'].values)
test['Phrase'] = vectorizer(test['Phrase'].values)
train.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapade demonstrate the adage tha...,1
1,2,1,a series of escapade demonstrate the adage tha...,2
2,3,1,a series,2
3,4,1,a,2
4,5,1,series,2
5,6,1,of escapade demonstrate the adage that what be...,2
6,7,1,of,2
7,8,1,escapade demonstrate the adage that what be go...,2
8,9,1,escapade,2
9,10,1,demonstrate the adage that what be good for th...,2


In [7]:
Y = train['Sentiment']
train.drop(['Sentiment','PhraseId','SentenceId'],axis=1,inplace=True)
test_PhraseId = test['PhraseId']
test.drop(['PhraseId','SentenceId'],axis=1,inplace=True)
print(train.shape,test.shape)
X_all = train.append(test)
print(X_all.shape)

(156060, 1) (66292, 1)
(222352, 1)


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
X_all = vect.fit_transform(X_all.Phrase)
X_all.shape

(222352, 16069)

In [9]:
#from sklearn.feature_extraction.text import TfidfTransformer
#tfidf = TfidfTransformer()
#X_all = tfidf.fit_transform(X_all)
#X_all.shape

In [10]:
X = X_all[:len(train)]
X_test = X_all[len(train):]
from sklearn.model_selection import train_test_split
X_train,X_validation,Y_train,Y_validation = train_test_split(X,Y,test_size=0.2)
print(X_train.shape,X_test.shape,X_validation.shape)

(124848, 16069) (66292, 16069) (31212, 16069)


In [11]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train,Y_train)
mnb.score(X_validation,Y_validation)

0.61123926694860953

In [14]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(n_estimator=100,learning_rate=0.1)
lgbm.fit(X_train.astype('float'),Y_train)
lgbm.score(X_validation.astype('float'),Y_validation)

0.58541586569268234

In [15]:
from sklearn.ensemble import RandomForestClassifier 
rfc = RandomForestClassifier()
rfc.fit(X_train,Y_train)
rfc.score(X_validation,Y_validation)

0.62133153915160833

In [None]:
#Giving Memory Error: Try using GPU on Servers
"""
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from keras.losses import categorical_crossentropy
from keras.utils import np_utils
sentiment_types = 5

nnmodel = Sequential()
nnmodel.add(Dense(62424,activation='relu',input_shape=(16069,)))
nnmodel.add(Dropout(0.2))
nnmodel.add(Dense(1024,activation='relu'))
nnmodel.add(Dropout(0.2))
nnmodel.add(Dense(524,activation='relu'))
nnmodel.add(Dropout(0.2))
nnmodel.add(Dense(sentiment_types,activation='softmax'))

batch_size = 128
epochs=1
train_Labels = np_utils.to_categorical(Y_train)
validation_Labels = np_utils.to_categorical(Y_validation)
nnmodel.compile(loss=categorical_crossentropy,optimizer="adam",metrics=['accuracy'])
nnmodel.fit(
    x=X_train,y=train_Labels,
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
    validation_data = (X_validation,validation_Labels)
)
"""

In [16]:
rfc.fit(X,Y)
ans = rfc.predict(X_test)

In [17]:
res = np.column_stack((test_PhraseId,ans))
out = pd.DataFrame(data=res,columns=['PhraseId','Sentiment'])
out.to_csv('sol.csv',index=False)