In [16]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB


In [17]:
# Load the training split. header=None tells pandas not to treat the first
# row as column names, so the columns get integer labels instead.
df_train = pd.read_csv(
    'data/yahoo_answers/train.csv',
    header=None,
)
df_train.head(5)

Unnamed: 0,0,1,2,3
0,5,why doesn't an optical mouse work on a glass t...,or even on some surfaces?,Optical mice use an LED and a camera to rapidl...
1,6,What is the best off-road motorcycle trail ?,long-distance trail throughout CA,i hear that the mojave road is amazing!<br />\...
2,3,What is Trans Fat? How to reduce that?,I heard that tras fat is bad for the body. Wh...,Trans fats occur in manufactured foods during ...
3,7,How many planes Fedex has?,I heard that it is the largest airline in the ...,according to the www.fedex.com web site:\nAir ...
4,7,"In the san francisco bay area, does it make se...",the prices of rent and the price of buying doe...,renting vs buying depends on your goals. <br /...


In [18]:
# Read test.csv
df_test = pd.read_csv('data/yahoo_answers/test.csv', header=None)
df_test.head()

Unnamed: 0,0,1,2,3
0,9,What makes friendship click?,How does the spark keep going?,good communication is what does it. Can you m...
1,2,Why does Zebras have stripes?,What is the purpose or those stripes? Who do t...,this provides camouflage - predator vision is ...
2,4,What did the itsy bitsy sipder climb up?,,waterspout
3,4,What is the difference between a Bachelors and...,,One difference between a Bachelors and a Maste...
4,3,Why do women get PMS?,,Premenstrual syndrome (PMS) is a group of symp...


In [19]:
# Build one free-text field per row by joining the question title [1], the
# question text [2] and the best answer [3] with single spaces. Missing
# pieces (NaN) are replaced by '' first so that one empty field does not turn
# the whole concatenation into NaN.
for frame in (df_train, df_test):
    frame['text'] = (
        frame[1].fillna('')
        + " " + frame[2].fillna('')
        + " " + frame[3].fillna('')
    )

In [20]:
# Features are the combined 'text' column; targets are the classes stored in
# column 0. The same split is applied to the train and test frames.
X_train, y_train = df_train['text'], df_train[0]
X_test, y_test = df_test['text'], df_test[0]

In [None]:
# Instantiate the TfidfVectorizer and fit it to the training text. Fitting only
# learns the vocabulary and the idf weights; the documents themselves are
# converted to a tf-idf matrix by the later call to vect.transform.
# max_features = 50000 keeps only the 50,000 most frequent terms of the
# training set in the vocabulary.
## For hyperparameter tuning changed the max_features, ngram_range, and min_df
vect = TfidfVectorizer(max_features = 50000).fit(X_train)
# Vocabulary size. vocabulary_ is used instead of get_feature_names(), which
# was removed in scikit-learn 1.2 and raises AttributeError on newer releases.
len(vect.vocabulary_)

In [None]:
# Transform the documents in X_train to a document-term matrix using the
# vocabulary and idf weights learned by `vect` above.
# Each row corresponds to a document, and each column to a term from the
# training vocabulary; the entries are tf-idf weights rather than raw counts.
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

In [None]:
# Now train the multinomial Naive Bayes classifier on the tf-idf feature
# matrix X_train_vectorized, using y_train as the target classes
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

In [None]:
# Vectorize the test documents with the vocabulary/idf weights fitted on the
# training set, then let the trained model assign a class to each of them.
X_test_vectorized = vect.transform(X_test)
predictions = model.predict(X_test_vectorized)

In [None]:
# Compute the accuracy score of the prediction, then print the per-class
# precision / recall / f1 report.
# (classification_report is imported together with the other sklearn.metrics
# names in the imports cell at the top of the notebook.)
print('Accuracy Score: ', accuracy_score(y_test, predictions))
# Accuracy recorded for earlier runs of this notebook:
# Original --> 0.7015166666666667
# TfidfVectorizer(max_features = 200000) --> 0.7027333333333333
# TfidfVectorizer(max_features = 200000, ngram_range = (1,2)) --> 0.69815
print("\n\n\n", classification_report(y_test, predictions))

In [None]:
### Just for reference ###
# Rank the vocabulary by the largest tf-idf value each term reaches in any
# training document, then show the ten lowest- and ten highest-ranked terms.
# This inspects the tf-idf matrix only; it does not use the fitted model.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on older releases.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = np.asarray(vect.get_feature_names_out())
else:
    feature_names = np.array(vect.get_feature_names())

# Column-wise maximum over all documents gives one value per term; argsort
# orders the term indices from the smallest to the largest maximum.
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest Tfidf: \n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
# [:-11:-1] walks the sorted indices backwards, i.e. the ten largest first.
print('Largest Tfidf: \n{}\n'.format(feature_names[sorted_tfidf_index[:-11:-1]]))


In [None]:
# Sanity check: recompute the accuracy by hand (1 - error rate) to compare
# with the value reported by accuracy_score.
# The comparison is done position by position on plain arrays. Indexing the
# Series as y_test[i] looks rows up by index *label*, which only coincides
# with position i while df_test keeps its default 0..n-1 index.
n_wrong = int(np.count_nonzero(predictions != np.asarray(y_test)))
print(1-(n_wrong/len(predictions)))