In [10]:
import pickle as pic
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV as gsearch
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier as forest
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.cross_validation import KFold
import nltk.data
%matplotlib inline
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Keenan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def save_obj(obj, name):
    """Pickle `obj` into the file at path `name`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. The highest pickle protocol available is used.
    """
    with open(name, 'wb') as out_file:
        pic.dump(obj, out_file, protocol=pic.HIGHEST_PROTOCOL)
def load_obj(name ):
    """Return the object unpickled from the file at path `name`."""
    # NOTE: unpickling can execute arbitrary code, so only use this on
    # files that come from a trusted source.
    with open(name, 'rb') as in_file:
        loaded = pic.load(in_file)
    return loaded

In [3]:
# Load the pickled object stored at data/tmdb_df_5k.pickle. The cells below treat it
# as a DataFrame with (at least) 'title', 'genres' and 'overview' columns.
five_k = load_obj('data/tmdb_df_5k.pickle')

In [4]:
def df_overview_clean(df):
    """Return the 'title', 'genres' and 'overview' columns of a copy of `df`.

    `df` itself is left untouched because the selection is made on a copy.
    """
    wanted_columns = ['title', 'genres', 'overview']
    return df.copy()[wanted_columns]

In [5]:
# Keep only the title, genres and overview columns of the loaded data.
five_k_over = df_overview_clean(five_k)

In [6]:
# Genre labels, in the fixed order used when building a per-movie genre vector
# (one 0/1 entry per label, see assign_genre_vector).
classes = [
    'Action',
    'Adventure',
    'Animation',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Family',
    'Fantasy',
    'Foreign',
    'History',
    'Horror',
    'Music',
    'Mystery',
    'Romance',
    'Science Fiction',
    'TV Movie',
    'Thriller',
    'War',
    'Western',
]

In [7]:
def genre_extract(genre_dic):
    """Return the 'name' value of every dict in `genre_dic`, in order.

    `genre_dic` is an iterable of dicts that each have a 'name' key;
    a KeyError is raised if one of them does not.
    """
    return [entry['name'] for entry in genre_dic]

def assign_genre_vector(row_genres, all_classes):
    """Encode a movie's genres as a 0/1 list aligned with `all_classes`.

    `row_genres` is an iterable of dicts with a 'name' key (the names are
    pulled out with genre_extract). The result has one entry per element of
    `all_classes`, in the same order: 1 if that genre name is present for
    the movie, 0 otherwise.
    """
    present = genre_extract(row_genres)
    return [1 if label in present else 0 for label in all_classes]

In [8]:
# Collect one record per movie and build the DataFrame in a single call.
# Growing a DataFrame with .append() inside a loop copies the whole frame on
# every iteration, and DataFrame.append no longer exists in pandas >= 2.0.
five_k_records = []
for title, raw_genres, overview in zip(five_k_over['title'], five_k_over['genres'], five_k_over['overview']):
    genre_vec = assign_genre_vector(raw_genres, classes)
    genres = genre_extract(raw_genres)

    #had to throw this in for the vectorization process-later:
    #a missing (None) or empty overview is replaced by the placeholder 'a'
    if not overview:
        overview = 'a'
    five_k_records.append({'title': title, 'genres': raw_genres, 'overview': overview,
                           'genre_vec': genre_vec, 'all_genres': genres})

# Passing `columns` keeps the column order fixed, even when there are no records.
five_k_vec = pd.DataFrame(five_k_records,
                          columns=['title', 'genres', 'overview', 'genre_vec', 'all_genres'])

In [9]:
#set our vectorization
# Bag-of-words counts over the overview text, with NLTK's English stop words removed.
# max_df / min_df prune the vocabulary by document frequency (per the CountVectorizer
# docs, a float is a proportion of documents and an int is an absolute document count).
vectorizer = CountVectorizer(stop_words=stopwords.words("english"),
    max_df = 0.9, 
    min_df = 5, 
    dtype=np.float32 )

# One overview string per movie; empty overviews were replaced by 'a' when five_k_vec was built.
corpus = five_k_vec['overview'].values
# .toarray() converts the vectorizer output to a dense array with one row per overview.
X = vectorizer.fit_transform(corpus).toarray()
# Binary indicator matrix built from each movie's list of genre names.
# NOTE(review): the labels come from 'all_genres'; the 'genre_vec' column is not used here,
# and the fitted MultiLabelBinarizer is not kept, so its column-to-genre mapping is discarded.
ys = MultiLabelBinarizer().fit_transform(five_k_vec['all_genres'])

In [10]:
#http://stackoverflow.com/questions/36523558/multi-class-logistic-regression-in-scikit-learn
#Quoting the answer: "And you seem to be looking for multilabel (as for multiclass labels should be 1-dim).
#Currently, in sklearn, the only methods supporting multilabel are:
#Decision Trees, Random Forests, Nearest Neighbors, Ridge Regression."
#Logistic regression is not in that list, so we have to wrap it in one-vs-rest classification (OneVsRestClassifier).

In [13]:
def cross_val_mod(X_train, y_train, mod, parameters, folds = 5):
    """Grid-search `mod` over `parameters` with cross-validation.

    X_train, y_train -- training features and targets passed to the search's fit
    mod              -- the estimator to tune
    parameters       -- parameter grid handed to the grid search
    folds            -- value passed as the grid search's `cv` argument

    Returns a tuple (best_estimator_, best_params_) taken from the fitted search.
    """
    search = gsearch(mod, parameters, cv=folds)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    best_settings = search.best_params_
    return best_model, best_settings

In [None]:
#Log Reg without CV
# Fixed random_state so the 70/30 split, and therefore the reported score, is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,ys,test_size=0.3, random_state=42)

# One binary logistic regression per genre column (see the multilabel note above).
oneVsResClassifier = OneVsRestClassifier(LogReg()).fit(X_train, y_train)

# With multilabel targets, score() is subset accuracy: a movie only counts as correct
# when every one of its genre labels is predicted exactly, so low values are expected.
oneVsResClassifier.score(X_test, y_test)

In [None]:
#forest without CV
# Fixed random_state on both the split and the forest so the reported score is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,ys,test_size=0.3, random_state=42)

forest_mod = forest(random_state=42)
forest_mod.fit(X_train, y_train)

# With multilabel targets, score() is subset accuracy: a movie only counts as correct
# when every one of its genre labels is predicted exactly.
forest_mod.score(X_test, y_test)

These models only perform as well as the baseline, probably because they over-predict the most popular classes.

In [16]:
#forest with CV
# Fixed random_state on both the split and the forest so the search result is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,ys,test_size=0.3, random_state=42)

# Grid of forest sizes and tree depths explored by the 10-fold grid search.
params = {'n_estimators': [5, 10, 15, 20],
          'max_depth': [2, 5, 7, 9]}
# cv_mod is (best estimator, best parameter dict), as returned by cross_val_mod.
cv_mod = cross_val_mod(X_train, y_train, forest(random_state=42), params, 10)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
# score() on multilabel targets is subset accuracy (all genres must match exactly).
print(cv_mod[0].score(X_test, y_test))
print(cv_mod[1])

0.00866666666667
{'n_estimators': 5, 'max_depth': 9}


In [29]:
#Log Reg with CV
# Fixed random_state so the 70/30 split, and therefore the reported score, is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,ys,test_size=0.3, random_state=42)

# Candidate regularisation strengths: 10**-3 ... 10**2.
all_c = np.power(10.0, range(-3, 3))

# 'estimator__C' addresses the C parameter of the LogReg wrapped by OneVsRestClassifier.
params = {'estimator__C':all_c}

# cv_mod is (best estimator, best parameter dict), as returned by cross_val_mod.
cv_mod = cross_val_mod(X_train, y_train, OneVsRestClassifier(LogReg(class_weight='balanced')), params, 10)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
# score() on multilabel targets is subset accuracy (all genres must match exactly).
print(cv_mod[0].score(X_test, y_test))
print(cv_mod[1])

KeyboardInterrupt: 