In [1]:
import numpy  as np
import pandas as pd

In [2]:
train_data = pd.read_csv("x_y_train.csv")
test_data = pd.read_csv("x_test.csv")

In [3]:
x_train = train_data['text'].values
y_train = train_data['airline_sentiment'].values
x_test  = test_data['text'].values

### Importing Stopwords

In [4]:
from nltk.corpus import stopwords
import string
stop_words = stopwords.words('english')
stop_words += list(string.punctuation)

In [5]:
from sklearn.model_selection import train_test_split
xt_train, xt_test, yt_train, yt_test = train_test_split(x_train, y_train, random_state = 0)

### Using Count Vectoriser

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 4000, analyzer = 'word', stop_words = stop_words, ngram_range = (1,4), max_df = 0.8)

In [9]:
xt_train_vec = cv.fit_transform(xt_train)
xt_test_vec  = cv.transform(xt_test)

### Training and Testing the data using various classifiers

Support Vector Classifer

In [10]:
from sklearn.svm import SVC
svc = SVC(gamma = 'auto')
svc.fit(xt_train_vec, yt_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
svc.score(xt_test_vec, yt_test)

0.6331511839708561

'Multinomial' Naive Bayes

In [12]:
from sklearn.naive_bayes import MultinomialNB
NBClassifer = MultinomialNB(alpha = 0.1)
NBClassifer.fit(xt_train_vec, yt_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [13]:
NBClassifer.score(xt_test_vec, yt_test)

0.751183970856102

Implementing Grid Search

In [14]:
from sklearn.model_selection import GridSearchCV
d = {
    'alpha' : [0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19]
}
gcv = GridSearchCV(NBClassifer, d)

In [15]:
gcv.fit(xt_train_vec, yt_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=0.1, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16,
                                   0.17, 0.18, 0.19]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [16]:
gcv.best_estimator_

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [17]:
gcv.cv_results_

{'mean_fit_time': array([0.02091026, 0.01526888, 0.03692937, 0.02592564, 0.01778507,
        0.02122362, 0.0156517 , 0.01436869, 0.01554807, 0.01627   ]),
 'std_fit_time': array([0.00554187, 0.00064061, 0.02727756, 0.00518992, 0.00055637,
        0.0042796 , 0.00077438, 0.00013257, 0.00108539, 0.00082567]),
 'mean_score_time': array([0.00350451, 0.00210579, 0.00250999, 0.00233396, 0.00296362,
        0.00272926, 0.00230328, 0.00215403, 0.00229065, 0.00228532]),
 'std_score_time': array([7.59452585e-04, 3.25103255e-05, 5.61605961e-04, 2.96447040e-04,
        1.15783147e-03, 3.24851903e-04, 2.21326535e-04, 7.04069436e-05,
        1.81906276e-04, 2.08482048e-04]),
 'param_alpha': masked_array(data=[0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18,
                    0.19],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 0.1},
  {'alpha': 0.11},
  {'a

In [18]:
gcv.score(xt_test_vec, yt_test)

0.751183970856102

K-Nearest Neighbour

In [19]:
from sklearn.neighbors import KNeighborsClassifier
KNNClassifier = KNeighborsClassifier()
KNNClassifier.fit(xt_train_vec, yt_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [20]:
KNNClassifier.score(xt_test_vec, yt_test)

0.4786885245901639

### Using Naive Bayes for Prediction

In [21]:
x_test_vec  = cv.transform(x_test)
y_predicted = gcv.predict(x_test_vec)

In [22]:
np.savetxt("output.csv", y_predicted, fmt='%s')