In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
sns.set_style("darkgrid")

## Preliminary Stuff

In [3]:
dirname = '/Users/austinlasseter/DSI-EC-2/projects/datasets/proj_3/'
data = pd.read_csv(dirname + 'clean_data.csv')
# First we need to create the outcome variable
data['over_under']=(train['comments']>train['comments'].median()).astype(int)
print(data.shape)
data.head()

NameError: name 'train' is not defined

In [None]:
# Split the training data into a further train/test split (remember, we set aside 30% earlier for final testing)
X = train.drop(['comments', 'over_under'], axis=1)
y = train['over_under']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)
X_train.shape

## Count Vectorizer on `Title`

In [None]:
# Initialize a standard CountVectorizer and fit the training data.
cvec = CountVectorizer(stop_words='english', max_features=1000)
cvec.fit(X_train['title']);
cvecdata = cvec.transform(X_train['title'])
cvec_title  = pd.DataFrame(cvecdata.todense(),
             columns=cvec.get_feature_names())

In [None]:
# What is the shape of that dataframe?
cvec_title.shape # It has about 2,000 posts and about 6,000 words

In [None]:
# Check out that matrix
cvec_title.head(3)

In [None]:
# What's the distribution of common words?
common=cvec_title.sum()
print(common.describe()) 
# 75% of the words appear less than 2 times.

### Check out the most commonly used words (this is purely exploratory & doesn't affect my model)

In [None]:
common=cvec_title.sum()

In [None]:
# Because we set max_feature=1000, these are the 1000 most commonly used words. About half get 4+ appearances.
print(common.describe())

In [None]:
# List the 15 most commonly used words.
common.sort_values(ascending=False).head(15) 

In [None]:
#  Here's the distribution of the common words (>2 times)
fig = plt.figure(figsize=(12,5))
ax=sns.distplot(common[common>2]);
ax.set(ylabel='Distribution', xlabel='Number of times a word shows up', title='Almost all words < 10 times.');

## Combine the vectorized `title` with my other feature columns

In [None]:
print(X_train.shape)
X_train.head(3)

In [None]:
# Drop the variables I don't need anymore.
X_train_short=X_train.drop(['title', 'subreddit', 'which_sub', 'others'], axis=1)

In [None]:
# Confirm that the two df's I'm about to merge have the same number of rows
print(X_train_short.shape)
print(cvec_title.shape)

In [None]:
# When I merge, I'm losing a lot of rows. But when I concat, I gain them. Why is that?
X_train_cvec=pd.merge(X_train_short, cvec_title, right_index=True, left_index=True)
X_train_cvec2=pd.concat([X_train_short, cvec_title], axis=1)
print(X_train_cvec.shape) # There are no NaN's.
print(X_train_cvec2.shape) # I'm also getting a lot of NaN's here.

## Train a KNN regression  on the vectorized `title` data

In [None]:
X_train.head()

In [None]:
#train the model
neighs = KNeighborsClassifier()
knn_model = neighs.fit(cvec_title, y_train)
# One thing I don't get... we aren't using any of the other features we engineered (length, time, subreddit)

In [None]:
# Now I want to use test data. I'll need to transform my test data the same way I transformed my train data
cvec = CountVectorizer(stop_words='english') # kill this
vectorizers_test = cvec.transform(X_test['title']) # dont fit
#transforming but not fitting my test data.
df_test = pd.DataFrame(vectorizers_test.todense(),
             columns=cvec.get_feature_names())
#putting it all into a dataframe to make it easier to understand

Ultimately, I chose not to use the updated list of stopwords.

In [None]:
# I expected these three dataframes to have the same shape, but they do not.
print(X_test.shape)
print(y_test.shape)
print(df_test.shape)

In [None]:
#knn_model.score(df_test, y_test)

## Train a logistic regression on the vectorized data

In [None]:
cvec = CountVectorizer(stop_words='english')
cvec.fit(X_train['title'])
X_train = pd.DataFrame(cvec.transform(X_train['title']).todense())
X_test = pd.DataFrame(cvec.transform(X_test['title']).todense())

In [None]:
lr = LogisticRegression()
lr.fit(X_train, y_train)