In [3]:
import ISO_Networks as ISON
import datawrangler
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Module-level stop-word collection; tf_idf_model and n_gram_counts_model
# read this global when they build their vectorizers.
stop_words=ISON.stop_words

# LOAD RAW DOCUMENTS

In [2]:
# Load the three corpora. Later cells read a `.text` attribute from each and
# name the results cmu / films / television, so presumably C = CMU corpus,
# F = film corpus, T = television corpus -- TODO confirm against ISO_Networks.
C, F, T =ISON.load_universe()

In [3]:
# Extract the raw document text of every corpus as a plain Python list.
cmu, films, television = [corpus.text.tolist() for corpus in (C, F, T)]

# LOAD CLEANED DATA
    Bag of Words - Clean

In [4]:
# Clean each film document with ISON.film_strip. The equivalent steps for the
# `cmu` and `television` corpora (universe_cmu / universe_tv) are disabled in
# this notebook, so only `universe_film` is produced here.
universe_film = list(map(ISON.film_strip, films))

    Bag of Strings

In [10]:
# Re-join every cleaned film document into one space-separated string, the
# input format the vectorizers below are called with. The matching steps for
# the CMU and TV corpora (string_cmu / string_tv) are disabled.
string_film = list(map(' '.join, universe_film))

In [None]:
# Class labels for the film corpus, wrapped in a NumPy array.
# Presumably one label per row of F, in the same order as `films` -- TODO confirm.
data_y = np.array(ISON.labels_film(F))

# N-GRAM Models
The code below turns a corpus into a bag-of-words matrix, using either TF-IDF weights or raw n-gram counts.

In [8]:
def tf_idf_model(corpus):
    """Vectorize a corpus into a TF-IDF weighted word n-gram matrix.

    Uni-, bi- and tri-grams are extracted; terms in the module-level
    ``stop_words`` collection are ignored.

    Parameters
    ----------
    corpus : iterable of str
        One string per document.

    Returns
    -------
    matrix : sparse matrix
        Result of ``TfidfVectorizer.fit_transform(corpus)``.
    feature_names : list
        Vocabulary terms reported by the fitted vectorizer.
    """
    # min_df=1 keeps every term, exactly like the previous min_df=0 (a
    # vocabulary term always occurs in at least one document), but is also
    # accepted by scikit-learn releases that require an integer min_df >= 1.
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1,
                         stop_words=stop_words)
    matrix = tf.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2. Use its replacement
    # when available and fall back to the old method on older releases.
    if hasattr(tf, 'get_feature_names_out'):
        feature_names = tf.get_feature_names_out().tolist()
    else:
        feature_names = tf.get_feature_names()
    return matrix, feature_names


def n_gram_counts_model(corpus):
    """Vectorize a corpus into a raw word n-gram count matrix.

    Uni-, bi- and tri-grams are extracted; terms in the module-level
    ``stop_words`` collection are ignored.

    Parameters
    ----------
    corpus : iterable of str
        One string per document.

    Returns
    -------
    matrix : sparse matrix
        Result of ``CountVectorizer.fit_transform(corpus)``.
    feature_names : list
        Vocabulary terms reported by the fitted vectorizer.
    """
    # min_df=1 keeps every term, exactly like the previous min_df=0 (a
    # vocabulary term always occurs in at least one document), but is also
    # accepted by scikit-learn releases that require an integer min_df >= 1.
    cv = CountVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1,
                         stop_words=stop_words)
    matrix = cv.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2. Use its replacement
    # when available and fall back to the old method on older releases.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()
    return matrix, feature_names

    

In [11]:
# Build the TF-IDF matrix (tfM) and its feature names (ft) for the film corpus.
# Expensive: the recorded run below took about six minutes of wall time.
%time tfM, ft = tf_idf_model(string_film)

CPU times: user 5min 5s, sys: 54.1 s, total: 5min 59s
Wall time: 6min 14s


In [22]:
# Build the raw n-gram count matrix (tfC) and its feature names (ftC) for the
# film corpus. Expensive: the recorded run below took about six minutes.
%time tfC, ftC = n_gram_counts_model(string_film)

CPU times: user 4min 34s, sys: 1min 4s, total: 5min 39s
Wall time: 6min 8s


In [23]:
# (documents, n-gram features) of the count matrix.
tfC.shape

(2827, 14130866)

# Machine Learning Model
The vectors obtained from the bag-of-words models above are then fed into a simple logistic regression model below. The ISON_Film_experiment function takes the sparse vector matrix of the film corpus, the labels, and a desired test size, and trains a logistic regression classifier on the data. It also accepts a topic model and dictionary, and reports which features a randomized logistic regression assigns a non-zero score.

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn import preprocessing


def train_test(data, labels, test_size, random_state=None):
    """Split features and labels into stratified train/test partitions.

    Parameters
    ----------
    data : array-like or sparse matrix
        Feature matrix with one row per sample.
    labels : array-like
        Class label per sample; also used as the stratification key so each
        class keeps the same proportion in both partitions.
    test_size : float or int
        Forwarded to ``train_test_split``.
    random_state : int, RandomState instance or None, optional
        Seed for the shuffle. The default ``None`` keeps the previous
        behaviour (a different split on every call); pass an integer to make
        the split reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    return tuple(train_test_split(data, labels,
                                  stratify=labels,
                                  test_size=test_size,
                                  random_state=random_state))

def ISON_Film_experiment(universe_representation, labels, test_size, model=None, dictionary=None):
    """Train and evaluate a one-vs-rest logistic regression classifier.

    The function prints its results and returns nothing. It prints, in order:
    a classification report and the accuracy on the held-out split, the
    non-zero feature scores of a randomized logistic regression, and -- when
    a topic model is supplied -- the topic behind each scored feature.

    Parameters
    ----------
    universe_representation : array-like or sparse matrix
        Feature matrix with one row per document.
    labels : array-like
        Class label per document.
    test_size : float or int
        Size of the held-out split, forwarded to ``train_test``.
    model : optional
        Topic model passed to ``datawrangler.topic_items``. Only meaningful
        when the columns of ``universe_representation`` are topics of this
        model, because the feature index is used to index the topic list.
        When ``None`` (the default) the per-topic listing is skipped, which
        lets the n-gram representations be evaluated with three arguments.
    dictionary : optional
        Accepted for call compatibility; not used by this function.
    """
    # STEP 1: SPLIT THE DATA
    X_train, X_test, y_train, y_test = train_test(universe_representation, labels, test_size)
    print('START')

    # STEP 2: SCORE FEATURES WITH A RANDOMIZED LOGISTIC REGRESSION
    # NOTE(review): RandomizedLogisticRegression was deprecated in
    # scikit-learn 0.19 and removed in 0.21, so this requires an old release.
    # NOTE(review): the scorer is fitted on ALL rows (train + test). Its
    # scores are only reported below; they are not used to select features
    # for the classifier.
    label_encoded = preprocessing.LabelEncoder().fit_transform(labels)
    randomizedLRM = RandomizedLogisticRegression().fit(universe_representation, label_encoded)
    feature_scores = randomizedLRM.all_scores_
    # Row indices of features with a non-zero score.
    index = np.where(feature_scores != 0)[0]

    # STEP 3: FIT AND EVALUATE THE CLASSIFIER
    model_ML = Pipeline([
        ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
    ])

    print('Fitting model')
    clf = model_ML.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('\n')
    print(classification_report(y_test, y_pred))
    # np.asarray makes the element-wise comparison explicit when the labels
    # were supplied as a plain list.
    print((y_pred == np.asarray(y_test)).mean())

    # STEP 4: REPORT THE SCORED FEATURES
    print('%s features were of importance, the following are the weights:' % len(index))
    print(feature_scores[index])
    print('#'*33, '\n')

    if model is not None:
        # 20 is forwarded to topic_items unchanged -- presumably the number of
        # terms shown per topic; verify against datawrangler.
        topics = datawrangler.topic_items(model, 20)
        for i in index:
            print(feature_scores[i], i, topics[i])

    print('#'*33, '\n')
    

In [None]:
# Logistic Regression with CountVectorizer()
%time ISON_Film_experiment(tfM,data_y, .20)

START




In [20]:
# Logistic Regression with Tfidf_vectorizer()
# NOTE(review): the recorded output below contains no feature-importance
# section, so it appears to come from an earlier definition of
# ISON_Film_experiment than the one shown in this notebook -- re-run to confirm.
%time ISON_Film_experiment(tfM,data_y, .20)

START
Fitting model


             precision    recall  f1-score   support

     Action       0.02      0.02      0.02        58
  Adventure       0.00      0.00      0.00        33
     Comedy       0.00      0.00      0.00        70
      Crime       0.00      0.00      0.00        40
      Drama       0.20      0.57      0.30       116
    Fantasy       0.00      0.00      0.00        23
     Horror       0.00      0.00      0.00        30
    Mystery       0.00      0.00      0.00        21
      Other       0.22      0.06      0.10        31
    Romance       0.00      0.00      0.00        38
     Sci-Fi       0.00      0.00      0.00        31
   Thriller       0.00      0.00      0.00        75

avg / total       0.05      0.12      0.07       566


  'precision', 'predicted', average, warn_for)



0.12190812720848057
################################# 

CPU times: user 11min 40s, sys: 23 s, total: 12min 3s
Wall time: 12min 29s


# Topic Models
The code below implements the second modeling technique, which uses a topic model generated from one corpus to model topics in a second corpus. In this case the topic model was trained on the TV corpus and then used to create a bag-of-topics model in which each document in the FILM corpus is represented by a vector of topics.

In [13]:
# produce a dense M x T matrix to train classifier on
def document_theme_vector(data, model, dictionary):
    """Represent every document as a fixed-width vector of topic weights.

    Each document is converted with ``dictionary.doc2bow`` and passed to
    ``model.get_document_topics(..., minimum_probability=0)``. The returned
    ``(topic_id, weight)`` pairs are written into a dense row at position
    ``topic_id``, so a topic the model leaves out of its answer becomes 0.0
    rather than shortening or shifting the row.
    NOTE(review): gensim's LDA implementation is understood to clip
    ``minimum_probability`` to a small positive value, so omissions are
    possible even when 0 is requested -- confirm for the model in use.

    Parameters
    ----------
    data : iterable
        Documents in the form ``dictionary.doc2bow`` accepts (presumably
        lists of tokens -- TODO confirm).
    model : object
        Topic model exposing ``get_document_topics``. Its ``num_topics``
        attribute, when present, fixes the number of columns; otherwise the
        largest topic id seen is used.
    dictionary : object
        Object exposing ``doc2bow``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_documents, n_topics)``; an empty 1-D array when
        ``data`` is empty.
    """
    # First pass: query the model and keep ids and weights side by side.
    sparse_rows = []
    for tokens in data:
        topic_probs = model.get_document_topics(dictionary.doc2bow(tokens),
                                                minimum_probability=0)
        topic_ids = np.array([topic_id for topic_id, _ in topic_probs], dtype=int)
        weights = np.array([prob for _, prob in topic_probs])
        sparse_rows.append((topic_ids, weights))

    if not sparse_rows:
        return np.array([])

    n_topics = getattr(model, 'num_topics', None)
    if n_topics is None:
        # Fall back to the widest row observed.
        n_topics = max((int(ids.max()) + 1 for ids, _ in sparse_rows if ids.size),
                       default=0)

    # Second pass: scatter each sparse row into a dense, fixed-width row,
    # keeping the weights' own dtype where there are any.
    theme_matrix = []
    for topic_ids, weights in sparse_rows:
        row = np.zeros(n_topics, dtype=weights.dtype if weights.size else float)
        row[topic_ids] = weights
        theme_matrix.append(row)

    return np.array(theme_matrix)

In [5]:
# Load the saved topic model and its dictionary, then load the pre-cleaned
# film corpus that will be vectorized with that model.
# NOTE(review): this rebinds `universe_film`, replacing the list built from
# `films` in the cleaning cell above; later cells see the pickled version.
# NOTE(review): depickler presumably unpickles a local file -- only load
# pickles from a trusted source.
model, dictionary = datawrangler.load_model("tvTOP50_50.model")
universe_film=ISON.depickler('universe_film.p')

In [15]:
# Topic-weight matrix for the film corpus: one row per document.
%time M=document_theme_vector(universe_film,model, dictionary)

CPU times: user 53.9 s, sys: 3.13 s, total: 57 s
Wall time: 54.8 s


In [22]:
# Rebuild the film labels for the topic-model experiment.
# NOTE(review): the earlier label cell wraps the same call in np.array; here
# the raw return value of labels_film is kept -- confirm this is intended.
data_y = ISON.labels_film(F)

In [30]:
# Logistic regression on the topic-weight matrix; the topic model is passed so
# the scored features can be listed alongside their topics.
%time ISON_Film_experiment(M, data_y, .20, model, dictionary)

START




Fitting model


             precision    recall  f1-score   support

     Action       0.33      0.12      0.18        58
  Adventure       0.00      0.00      0.00        33
     Comedy       0.52      0.21      0.30        70
      Crime       0.00      0.00      0.00        40
      Drama       0.23      0.84      0.36       116
    Fantasy       0.00      0.00      0.00        23
     Horror       0.00      0.00      0.00        30
      Other       0.00      0.00      0.00        52
    Romance       0.00      0.00      0.00        38
     Sci-Fi       0.40      0.06      0.11        31
   Thriller       0.17      0.19      0.18        75

avg / total       0.19      0.24      0.16       566

0.23851590106007067
28 features were of importance, the following are the weights:
[[0.565]
 [0.105]
 [0.47 ]
 [0.105]
 [0.61 ]
 [0.175]
 [0.8  ]
 [0.5  ]
 [0.18 ]
 [0.25 ]
 [0.46 ]
 [0.005]
 [0.01 ]
 [0.365]
 [0.685]
 [0.32 ]
 [0.68 ]
 [0.95 ]
 [0.125]
 [0.005]
 [0.535]
 [0.005]
 [0.16 ]
 [

  'precision', 'predicted', average, warn_for)
