# TF-IDF Log. Reg. classifier for educational vs entertainment YouTube videos
## Training

In [None]:
import pandas as pd
import numpy as np

yt_data = pd.read_csv('yt_edu.csv', index_col = 0)
yt_data.head()

In [None]:
## IF-IDF Logistic regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Split in train test (default 0.25 test)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(yt_data['transcript'],yt_data['label'])

In [None]:
# Vectorize raw test data
vectorizer = TfidfVectorizer(max_features = 1000, ngram_range = (1,2))
X_train = vectorizer.fit_transform(X_train_raw)

In [None]:
# Train LR classifier
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

In [None]:
# Test model accuracy, 5x cross validation
x_test = vectorizer.transform(X_test_raw)
scores = cross_val_score(classifier, x_test, y_test, cv=5)
print(scores)

In [None]:
# Do predictions
predictions = classifier.predict(x_test)

In [None]:
# report metrics
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

Scores for default:
[0.81132075 0.88679245 0.86792453 0.86538462 0.82692308]

                  precision    recall  f1-score   support

        edu       0.84      0.84      0.84       123
        rec       0.86      0.86      0.86       140
        avg/total 0.85      0.85      0.85       263
        
With 1-3 ngrams same f1 score, less balanced for category
Best with max features = 1000, ngrams 1-2

In [None]:
# Single prediction
text = X_test_raw[16]c
exemplar = vectorizer.transform([text]) # example string
exemplar_cat = classifier.predict(exemplar) # returns category
exemplar_prob = classifier.predict_proba(exemplar) # returns probability

if exemplar_prob[0][0] > exemplar_prob[0][1]:
    class_prob = exemplar_prob[0][0]
else: class_prob = exemplar_prob[0][1]

print('This transcript was classified as ' + exemplar_cat + ', with a probability of ' + str(class_prob))
print(text[1:280])

In [None]:
# Most important features
feature_array = np.array(vectorizer.get_feature_names())
tfidf_sorting = np.argsort(exemplar.toarray()).flatten()[::-1]

n = 3
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

In [None]:
## Save trained LR model
import pickle

pickle.dump(vectorizer, open("vectorizer_edu.pickle", "wb"))
pickle.dump(classifier, open("classifier_edu.pickle", "wb"))

## Classification with canned model

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression

In [3]:
vectorizer = pickle.load(open("vectorizer_edu.pickle", "rb"))
classifier = pickle.load(open("classifier_edu.pickle", "rb"))

In [None]:
# Single prediction
with open('N7-dg88gj6M_t.txt', 'r') as myfile:
             transcript=myfile.read().replace('\n', '')
        
text = transcript

exemplar = vectorizer.transform([text]) # example string
exemplar_cat = classifier.predict(exemplar) # returns category
exemplar_prob = classifier.predict_proba(exemplar) # returns probability

if exemplar_prob[0][0] > exemplar_prob[0][1]:
    class_prob = exemplar_prob[0][0]
else: class_prob = exemplar_prob[0][1]

print('This transcript was classified as ' + exemplar_cat + ', with a probability of ' + str(class_prob))
print(text[1:280])

In [None]:
# Most important features
feature_array = np.array(vectorizer.get_feature_names())
tfidf_sorting = np.argsort(exemplar.toarray()).flatten()[::-1]

n = 3
top_n = feature_array[tfidf_sorting][:n]
print(top_n)

In [1]:
2174.00/2

1087.0