In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB
import pandas as pd, numpy as np
from sklearn.preprocessing import LabelEncoder
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import classification_report
from nltk.tokenize import TreebankWordTokenizer
from stop_words import get_stop_words
from nltk.tokenize import RegexpTokenizer
#from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

In [24]:
def create_dataframe(filename):
    df = pd.read_csv(filename, encoding = 'UTF-8')
    return df

In [25]:
def preprocess(X, custom_stop = [], stem = False):
    en_stop = get_stop_words('en')
    en_stop.extend(custom_stop)
    tokenizer = RegexpTokenizer(r'\w+')
#     tokenizer = TreebankWordTokenizer()
    p_stemmer = PorterStemmer()
    list_of_X = X.apply(lambda row: row.lower())
    list_of_X = X.apply(lambda row: tokenizer.tokenize(row))
    list_of_X = list_of_X.apply(lambda row: [i for i in row if i not in en_stop])
    if stem == True:
        list_of_X = list_of_X.apply(lambda row: [p_stemmer.stem(i) for i in row])
    return list_of_X

In [26]:
train = create_dataframe('train_data.csv')
test = create_dataframe('test_data.csv')

In [27]:
def split_Xy(data):
    X = data['message']
    y = data.iloc[:,1:].astype(str).replace({'T':1,'F':0})
    y = y.idxmax(axis=1)
    X = pd.Series(X)
    y = pd.Series(y)
    y = LabelEncoder().fit_transform(y)
    return X,y

In [28]:
X_train , y_train = split_Xy(train)
X_test , y_test = split_Xy(test)

In [29]:
X_train_for_cloud = preprocess(X_train, stem = True)

In [30]:
# def create_word_cloud(X):
#     text = []
#     for sentence in X:
#         text.extend(sentence)
#     textall = " ".join(text)
#     wordcloud = WordCloud(max_font_size=40).generate(textall)
#     plt.imshow(wordcloud, interpolation='bilinear')
#     plt.axis("off")
#     plt.show()

In [31]:
# create_word_cloud(X_train_for_cloud)

In [32]:
X_train = preprocess(X_train,stem = True, custom_stop = ['hi','api_name','please','help','user_id'])
X_test  = preprocess(X_test, stem = True)

In [33]:
X_train

0                                          [7am, everyday]
1                                           [chocol, cake]
2              [close, mortic, tenon, joint, door, diment]
3                                  [train, eppo, kelambum]
4                      [yesterday, cancel, flight, ticket]
5                                       [chamg, 12pm, 9pm]
6                                    [want, go, rajasthan]
7                                                   [room]
8                            [can, arrang, flight, ticket]
9                                           [kind, remind]
10                                 [jamshedpur, jharkhand]
11                                     [noidaa, secot, 44]
12                                      [flight, spicejet]
13                                                  [uber]
14                                              [3, 3, 17]
15                                            [fare, high]
16                       [know, train, run, jalgaon, pun

In [34]:
def create_vect(X_train, X_test):
    X_train = [' '.join(sentence) for  sentence in X_train]
    X_test = [' '.join(sentence) for  sentence in X_test]
    vect = CountVectorizer()
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    X_test_dtm = vect.transform(X_test)
    return X_train_dtm , X_test_dtm

In [35]:
X_train_dtm , X_test_dtm = create_vect(X_train , X_test)


In [36]:
print X_train_dtm.shape[0] - len(y_train)
print X_test_dtm.shape[0] - len(y_test)

0
0


In [37]:
def prediction(X_train_dtm, y_train, X_test_dtm, y_test):
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_predictions = nb.predict(X_test_dtm)
    print classification_report(y_predictions, y_test)
    return accuracy_score(y_predictions, y_test)

In [38]:
prediction(X_train_dtm,y_train,X_test_dtm,y_test)

             precision    recall  f1-score   support

          0       0.67      0.89      0.76      2422
          1       0.62      0.84      0.72       604
          2       0.66      0.92      0.77       312
          3       0.71      0.67      0.69       733
          4       0.68      0.90      0.78      1223
          5       0.78      0.61      0.69       414
          6       0.84      0.82      0.83       884
          7       0.42      0.91      0.58       102
          8       0.97      0.52      0.68      3306

avg / total       0.79      0.74      0.73     10000



0.73580000000000001

In [39]:
X_train_dtm.shape
X_test_dtm.shape
y_train.shape
X_train_dtm.shape

(40659, 18836)