In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Display configuration: show floats with ten decimal places in both pandas
# and NumPy output.
# NOTE(review): the pandas format string begins with a literal '$', so every
# float rendered in a DataFrame gets a dollar prefix -- confirm this is wanted.
pd.set_option('display.float_format', '${:,.10f}'.format)


def float_formatter(x):
    """Return ``x`` formatted with exactly ten digits after the decimal point."""
    return "%.10f" % x


np.set_printoptions(formatter={'float_kind': float_formatter})

In [3]:
# CSV with the labelled tweets; the path is relative to the working directory.
file = 'tweets_public.csv'

# Read the whole file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(file)

# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 15 columns):
tweet_id                        8784 non-null int64
airline_sentiment               8784 non-null object
airline_sentiment_confidence    8784 non-null float64
negativereason                  5531 non-null object
negativereason_confidence       6325 non-null float64
airline                         8784 non-null object
airline_sentiment_gold          24 non-null object
name                            8784 non-null object
negativereason_gold             20 non-null object
retweet_count                   8784 non-null int64
text                            8784 non-null object
tweet_coord                     608 non-null object
tweet_created                   8784 non-null object
tweet_location                  5936 non-null object
user_timezone                   5973 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 1.0+ MB


In [4]:
# Store the sentiment label column as a pandas Categorical instead of objects.
df.airline_sentiment = pd.Categorical(df.airline_sentiment)

In [5]:
# Normalise every tweet in df['text']: keep letters only, lower-case the
# text, drop English stop words and Porter-stem the remaining words.
# NOTE: df['text'] is overwritten in place, so running this cell a second time
# would stem text that has already been stemmed.

# Build the stemmer and the stop-word set once. They do not depend on the
# tweet, so constructing them per tweet / per word only repeats identical work.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

corpus = []

for raw_text in df['text']:
    # Replace every non-letter character with a space, then split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()

    # Filter on the lower-cased token, then stem the survivors.
    stems = [ps.stem(word) for word in words if word not in english_stopwords]

    corpus.append(' '.join(stems))

df['text'] = corpus

In [6]:
# Hold out 25% of the labelled tweets for validation. A fixed random_state
# makes the split, and therefore the accuracy scores computed from it,
# reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.25, random_state=42)

In [7]:
# Bag-of-words vectoriser over unigrams and bigrams, capped at 1500 features.
cv = CountVectorizer(max_features=1500, ngram_range=(1,2), min_df=1)

In [8]:
# Fit the vocabulary on the training split only and encode that split.
x_train = cv.fit_transform(train['text'])

In [9]:
# Sentiment labels of the training split.
y_train = train['airline_sentiment'].values

In [10]:
# Encode the held-out tweets with the already-fitted vocabulary (no refit).
x_test = cv.transform(test['text'])

In [11]:
# Sentiment labels of the held-out split, used to score the predictions.
y_test = test['airline_sentiment'].values

In [12]:
# Instantiate the classifier with its default settings.
# NOTE(review): this rebinds the imported class name to an instance, so the
# class is no longer reachable under this name and re-running the cell fails
# (an instance is not callable). A distinct variable name would avoid this,
# but later cells refer to the model as `BernoulliNB`.
BernoulliNB = BernoulliNB()

In [13]:
# Train the Bernoulli naive Bayes model on the training matrix and labels.
BernoulliNB.fit(x_train,y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [14]:
# Predict a sentiment label for every held-out tweet.
y_predBNB = BernoulliNB.predict(x_test)

In [15]:
# Accuracy of the naive Bayes predictions against the held-out labels.
score = accuracy_score(y_test, y_predBNB)
print("Model score is: {}".format(score))

Model score is: 0.7723132969034608


In [16]:
# Instantiate the k-nearest-neighbours classifier with its default settings.
# NOTE(review): this rebinds the imported class name to an instance, so the
# class is no longer reachable under this name and re-running the cell fails
# (an instance is not callable).
KNeighborsClassifier = KNeighborsClassifier()

In [17]:
# Train the k-nearest-neighbours model on the same training matrix and labels.
KNeighborsClassifier.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [18]:
# Predict a sentiment label for every held-out tweet with the k-NN model.
y_predKNC = KNeighborsClassifier.predict(x_test)

In [19]:
# Accuracy of the k-NN predictions; this overwrites the earlier `score` value.
score = accuracy_score(y_test, y_predKNC)
print("Model score is: {}".format(score))

Model score is: 0.5519125683060109


In [20]:
# Load the submission tweets, using the tweet_id column as the row index.
df_submission = pd.read_csv('tweets_submission.csv', index_col='tweet_id')

In [21]:
# Clean the SUBMISSION tweets the same way the labelled tweets were cleaned:
# keep letters only, lower-case, drop English stop words, Porter-stem the rest.
#
# This cell previously looped over df['text'] again, which stemmed the
# already-cleaned training text a second time and left df_submission['text']
# untouched, so the final model was fitted on cleaned text but asked to
# predict on raw text.

# Build the stemmer and the stop-word set once; neither depends on the tweet.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

corpus = []

# Iterate over the values rather than indexing with 0..n-1: df_submission is
# indexed by tweet_id, so df_submission['text'][i] would be a label lookup.
for raw_text in df_submission['text']:
    # Replace every non-letter character with a space, then split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()

    # Filter on the lower-cased token, then stem the survivors.
    stems = [ps.stem(word) for word in words if word not in english_stopwords]

    corpus.append(' '.join(stems))

# A plain list is assigned positionally, so row order is preserved.
df_submission['text'] = corpus

In [22]:
# Fresh vectoriser with the same settings as before; it replaces the one that
# was fitted on the training split.
cv = CountVectorizer(max_features=1500, ngram_range=(1,2), min_df=1)

In [23]:
# Use every labelled tweet for the final fit. This binds the same DataFrame
# object (no copy) and replaces the earlier 75% training split.
train = df

In [24]:
# The submission tweets take the place of the held-out split (same object, no copy).
test = df_submission

In [25]:
# Fit the vocabulary on all labelled tweets and encode them.
x_train = cv.fit_transform(train['text'])

In [26]:
# Sentiment labels for all labelled tweets.
y_train = train['airline_sentiment'].values

In [27]:
# Encode the submission tweets with the vocabulary fitted on the labelled set.
x_test = cv.transform(test['text'])

In [28]:
# Refit the existing naive Bayes instance on the full labelled set.
BernoulliNB.fit(x_train,y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [29]:
# Predict a sentiment label for every submission tweet.
y_pred = BernoulliNB.predict(x_test)

In [31]:
# Build a file name from the current local time (month_day_year-hour_min_sec).
date = dt.datetime.now().strftime("%m_%d_%Y-%H_%M_%S")
filename = 'submission_' + date + '.csv'

# Attach the predictions and write only that column; the tweet_id index is
# written alongside it because to_csv includes the index by default.
df_submission['airline_sentiment'] = y_pred
df_submission[['airline_sentiment']].to_csv(filename)

print('Submission file created: {}'.format(filename))
print('Upload it to Kaggle InClass')

Submission file created: submission_02_20_2018-22_04_57.csv
Upload it to Kaggle InClass
