In [3]:
import pandas as pd
import numpy as np

yt_data = pd.read_csv('yt_edu.csv', index_col = 0)
yt_data.head()

Unnamed: 0,ytID,transcript,confidence,channelId,title,tags,description,duration,label
0,--rJwJuXuOI,yeah Peter Fairly OddParents Danny Phantom TU...,0.93132,UC80Z-cIOdR6upfnrPJ8Q00A,Becoming a CARTOON! | Thomas Sanders feat. But...,Thomas Sanders thomas sanders vine thomas sand...,"By now, I think it's pretty apparent that I ha...",551.0,rec
1,-4XvTqhMbjo,season 2 if you have been watching since,0.920828,UCIRwyPYF1_7mCplRFGB_Png,Tatianna featuring Katya The Same Parts,[],Night Of The Living Drag @ Stage 48 \nNYC\n10/...,394.0,rec
2,-4YF2u-wx7A,what does feminism mean to you I think there'...,0.946498,UCi_BjZoqPnMmkCLv2FlyBxQ,What Does Feminism Mean to You? (Smart Snacks ...,smart girls chvrches indie music independent l...,"Anna Bulbrook (GIRLSCHOOL, The Bulls, The Airb...",27.0,edu
3,-4bGPF-8Syo,walking Hills like her she came in looking li...,0.854905,UCfm4y4rHF5HGrSr-qbvOwOg,"Three Girls, One Elevator (ft. Zendaya & Winni...",iisuperwomanii iisuperwomenii superwoman super...,Wouldn't it be awesome if more women said hi t...,116.0,rec
4,-5I4FCUo7AA,you don't look like a superhero that it's alr...,0.902851,UCi9cDo6239RAzPpBZO9y5SA,"Are We Frenemies?! | Lele Pons, Hannah Stockin...",are we frenemies lele pons hannah stocking anw...,WATCH CELOSO MUSIC VIDEO HERE ▶ https://youtu....,402.0,rec


In [4]:
## IF-IDF Logistic regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [5]:
# Split in train test (default 0.25 test)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(yt_data['transcript'],yt_data['label'])

In [10]:
# Vectorize raw test data
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)

In [11]:
# Train LR classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Test model accuracy, 5x cross validation
x_test = vectorizer.transform(X_test_raw)
scores = cross_val_score(classifier, x_test, y_test, cv=5)
print(scores)

[0.81132075 0.88679245 0.86792453 0.86538462 0.82692308]


In [20]:
# Do predictions
predictions = classifier.predict(x_test)

In [24]:
# report metrics
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

        edu       0.84      0.84      0.84       123
        rec       0.86      0.86      0.86       140

avg / total       0.85      0.85      0.85       263



In [62]:
# Single prediction
text = X_test_raw[16]
exemplar = vectorizer.transform([text]) # example string
exemplar_cat = classifier.predict(exemplar) # returns category
exemplar_prob = classifier.predict_proba(exemplar) # returns probability

if exemplar_prob[0][0] > exemplar_prob[0][1]:
    class_prob = exemplar_prob[0][0]
else: class_prob = exemplar_prob[0][1]

print('This transcript was classified as ' + exemplar_cat + ', with a probability of ' + str(class_prob))
print(text[1:280])

['This transcript was classified as rec, with a probability of 0.5823674460678282']
should I get your jacket that you took mine is not dead or something like that it's because of romantic movies they set the bar too high for the rest of us what does the same thing every year I thought about it actually idea that we should do that is a trailer and also the mean 


In [63]:
# Most important features
feature_array = np.array(vectorizer.get_feature_names())
tfidf_sorting = np.argsort(exemplar.toarray()).flatten()[::-1]

n = 3
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

['jeans' 'somebody' 'should']


In [64]:
## Save trained LR model
import pickle

file_Name = "classifier_edu"

fileObject = open(file_Name,'wb') 
pickle.dump(classifier,fileObject)   
fileObject.close()

In [None]:
# To open the file for reading
fileObject = open(file_Name,'rb')
classifier_pol_bias = pickle.load(fileObject)  
fileObject.close()