## Create new 'text' and 'end' columns in unlabeled and labeled datasets

In [34]:
import pandas as pd

LABELED_FILE_NAME = "labeled.csv"
UNLABELED_FILE_NAME = "unlabeled.csv"
OUTPUT_CSV_FILE_PATH = "new_data_predicted.csv"
FILES_TO_TAG = [LABELED_FILE_NAME, UNLABELED_FILE_NAME]
SELECTED_COLUMNS = {
    "TITLE": "Title",
    "UPLOADER": "UploaderUsername",
    "UPLOADED_MM_YYYY": "SamplingCategory",
    "VIEWCOUNT":"ViewCount",
    "DESCRIPTION": "Description"
}
TEXT_COL_NAME = "text"
END_COL_NAME = "end"
for file_name in FILES_TO_TAG:
    preproc_labeled_df = pd.read_csv(file_name, encoding='utf-8')
    for index, row in preproc_labeled_df.iterrows():
        text_col_initial_text = ""
        for key, value in SELECTED_COLUMNS.items():
            text_in_cell = str(preproc_labeled_df.loc[index, value])
            text_col_initial_text += key + ": " + text_in_cell + "\n"
        preproc_labeled_df.at[index, TEXT_COL_NAME] = text_col_initial_text
        preproc_labeled_df.at[index, END_COL_NAME] = 1
    preproc_labeled_df.to_csv(file_name, index=False, encoding='utf-8-sig')

## Download packages, split test and training data

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.utils import class_weight
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.sparse import csr_matrix
import spacy
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import sys
import os
import warnings
import pandas as pd
import json
import string
import re
import nltk
import random
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from scipy.signal import savgol_filter
from kneed import KneeLocator
warnings.filterwarnings("ignore")
from kneed import DataGenerator, KneeLocator

In [37]:
# df = pd.read_csv('00.diplomacy_corpus_sep18.csv', encoding='cp1252')
df = pd.read_csv(LABELED_FILE_NAME, encoding='utf-8')
print(df.shape)
#df = df[df['message'].notna()]

# df['perception'] = df.apply(lambda row: 0 if row['reciever_perception'] == 'Truth' else 1, axis=1)
# train = df[(df['train_test'] == 'Train') | (df['train_test'] == 'Validation')]
# test = df[df['train_test'] == 'Test']
train, test = train_test_split(df, test_size=0.2)
print(len(test))

(312, 17)
63


## Bag of words and test only logreg

In [38]:
nlp = English()

def is_number(tok):
    try:
        float(tok)
        return True
    except ValueError:
        return False

def spacy_tokenizer(text):
    return [tok.text if not is_number(tok.text) else '_NUM_' for tok in nlp(text)]


In [39]:
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, stop_words=STOP_WORDS, strip_accents='unicode')
corpus = list(train['text'].str.lower())
X_train = vectorizer.fit_transform(corpus)

vectorizer2 = CountVectorizer(tokenizer=spacy_tokenizer, vocabulary=vectorizer.vocabulary_, stop_words=STOP_WORDS, strip_accents='unicode')
X_test = vectorizer2.fit_transform(list(test['text'].str.lower()))

In [40]:
X_train = csr_matrix(X_train)
y_train = train['IsCollab']
X_test = csr_matrix(X_test)
y_test = test['IsCollab']

In [41]:
logreg = LogisticRegression(class_weight = 'balanced', max_iter=1000)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [42]:
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.841     0.925     0.881        40
           1      0.842     0.696     0.762        23

    accuracy                          0.841        63
   macro avg      0.842     0.810     0.821        63
weighted avg      0.841     0.841     0.837        63



## Extract tfidf features and test a lot of classifiers

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_features(df):
    """
        X_train, X_test — samples        
        return TF-IDF vectorized representation of each sample and vocabulary
    """
    # Create TF-IDF vectorizer with a proper parameters choice
    # Fit the vectorizer on the train set
    # Transform the train, test set and return the result
    
    
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern='(\S+)')

    tfidfs = tfidf_vectorizer.fit_transform(df['text'])
    df_tfidf = pd.DataFrame(tfidfs.toarray(), columns=tfidf_vectorizer.get_feature_names())
    return df_tfidf

#df1 = 
df_tfidf = tfidf_features(df)
#print(tfidf_reversed_vocab)
#for i in range(df_tfidf.shape[1]):
#  df['tfidf_{}'.format(i)] = df_tfidf[:, i].toarray()[0]


In [44]:
import csv
print(df_tfidf.shape)
#tf_idf_matrix = df_tfidf.todense()
#tf_idf_df = pd.DataFrame(tf_idf_matrix)
 
#tf_idf_df.columns = tfidf_reversed_vocab
data_top = df_tfidf.head() 
    
# display 
print(data_top)
df_withtfidf = pd.concat([df, df_tfidf], axis=1)

data_top = df_withtfidf.head() 
    
# display 
print(data_top)

(312, 2450)
    #3         &  & cape  & film  & film.  & georgie?  & my    & sony  & the  \
0  0.0  0.000000     0.0     0.0      0.0         0.0   0.0  0.000000    0.0   
1  0.0  0.085007     0.0     0.0      0.0         0.0   0.0  0.086976    0.0   
2  0.0  0.000000     0.0     0.0      0.0         0.0   0.0  0.000000    0.0   
3  0.0  0.000000     0.0     0.0      0.0         0.0   0.0  0.000000    0.0   
4  0.0  0.000000     0.0     0.0      0.0         0.0   0.0  0.000000    0.0   

   & vv  'ask  'ask ben'  'ben  'ben brown'  'feel  'feel good    (funky  \
0   0.0   0.0        0.0   0.0          0.0    0.0         0.0  0.000000   
1   0.0   0.0        0.0   0.0          0.0    0.0         0.0  0.067535   
2   0.0   0.0        0.0   0.0          0.0    0.0         0.0  0.000000   
3   0.0   0.0        0.0   0.0          0.0    0.0         0.0  0.065930   
4   0.0   0.0        0.0   0.0          0.0    0.0         0.0  0.000000   

   (funky jazz       (in  (in landscape)  (part  (

       VideoID                               Title UploaderUsername  \
0  6X1smU5q1mo       Sephora and Ulta Makeup Haul!       Arden Rose   
1  Cp7_1kJw-JQ                ARRIVING IN PARADISE      FunForLouis   
2  CK963DVi6pI  CALLING MY MUM!! - KSI Animated #4              KSI   
3  jiodLpPKTSc                       PROFESSOR DOG      FunForLouis   
4  InLV5EY-um8     A Brief History Of Dodie Yellow    Ryan O'Connor   

           DateUploaded                        VideoLink  Status  \
0  2013-03-04T01:31:31Z  youtube.com/watch?v=6X1smU5q1mo  public   
1  2016-05-19T20:30:00Z  youtube.com/watch?v=Cp7_1kJw-JQ  public   
2  2015-02-07T20:09:02Z  youtube.com/watch?v=CK963DVi6pI  public   
3  2014-06-17T17:00:06Z  youtube.com/watch?v=jiodLpPKTSc  public   
4  2017-04-20T18:38:47Z  youtube.com/watch?v=InLV5EY-um8  public   

                                         Description  \
0  Hope you guys enjoyed!\n\nFollow me on:\n\nPol...   
1  DAY 1117 // 16TH MAY 2016 //  Dallas United St...

## All preds

In [45]:
main_df = df_withtfidf
feature_start, num_ftrs = 17,1496
print(df_withtfidf.shape)

(312, 2467)


In [46]:
all_classifiers = {}
all_scalers = {}
pd.set_option("display.max_rows", None, "display.max_columns", None)
for feature in ['IsCollab','IsGaming']:
  print('='*80)
  print('For {}:'.format(feature))
  print('Before: {}'.format(len(main_df)), end=' ')
  df = main_df[main_df[feature].notna()]
  print('After: {}'.format(len(df)))
  print('\nClass distribution:')
  print(df[feature].value_counts())
  print()
  vc = dict(df[feature].value_counts())
  minority_class = min(vc, key=vc.get)
  X = df.iloc[:, feature_start:feature_start+num_ftrs]
  y = df[feature]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)

  classifiers = {'logreg': LogisticRegression(class_weight='balanced'), 
               'knn': KNeighborsClassifier(), 
               'gaussianNB': GaussianNB(),
               'bernoulliNB': BernoulliNB(),
               'adaboost': AdaBoostClassifier(), 
               'grad-boost': GradientBoostingClassifier(),
               'dec-tree': DecisionTreeClassifier(), 
               'linear-svc': LinearSVC(class_weight='balanced'), 
               'c-svc': SVC(class_weight='balanced'), 
               'lda': LinearDiscriminantAnalysis()}
  report = []
  all_classifiers[feature] = classifiers
  all_scalers[feature] = scaler

  

  for method, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    report.append([method, 
                 metrics.accuracy_score(y_test, y_pred), 
                 metrics.f1_score(y_test, y_pred),
                 metrics.precision_score(y_test, y_pred),
                 metrics.recall_score(y_test, y_pred), 
                 metrics.f1_score(y_test, y_pred, average='macro'),
                 classification_report(y_test, y_pred, output_dict=True)[str(minority_class)]['f1-score']])
    
    # print('-'*60)
    # print('{:^60}'.format(method))
    # print('-'*60)
    # print(classification_report(y_test, y_pred, digits=3))
    # print()
    
  report = pd.DataFrame(report, columns = ['method', 'accuracy', 'f1', 'precision', 'recall', 'macro-f1', 'minority-f1'])
  print(report)
  print('='*80, '\n\n')

For IsCollab:
Before: 312 After: 312

Class distribution:
0    172
1    140
Name: IsCollab, dtype: int64

        method  accuracy        f1  precision    recall  macro-f1  minority-f1
0       logreg  0.682540  0.696970   0.696970  0.696970  0.681818     0.696970
1          knn  0.761905  0.754098   0.821429  0.696970  0.761665     0.754098
2   gaussianNB  0.682540  0.655172   0.760000  0.575758  0.680527     0.655172
3  bernoulliNB  0.777778  0.740741   0.952381  0.606061  0.773148     0.740741
4     adaboost  0.761905  0.745763   0.846154  0.666667  0.760941     0.745763
5   grad-boost  0.761905  0.754098   0.821429  0.696970  0.761665     0.754098
6     dec-tree  0.730159  0.721311   0.785714  0.666667  0.729887     0.721311
7   linear-svc  0.666667  0.676923   0.687500  0.666667  0.666330     0.676923
8        c-svc  0.746032  0.757576   0.757576  0.757576  0.745455     0.757576
9          lda  0.666667  0.695652   0.666667  0.727273  0.663616     0.695652


For IsGaming:
Before: 3

In [47]:

best_classifiers = {'IsCollab': 'knn', 
                    'IsGaming': 'knn'}

## Predicting for new data: generate tfidf features again

In [48]:
#predicting for new data
#1. generate tfidf features again for your unlabeled dataset
unlabeled_df = pd.read_csv(UNLABELED_FILE_NAME, encoding='UTF-8')
print(unlabeled_df.shape)
unlabeled_df_tfidf = tfidf_features(unlabeled_df)
import csv
print(unlabeled_df_tfidf.shape)
#tf_idf_matrix = df_tfidf.todense()
#tf_idf_df = pd.DataFrame(tf_idf_matrix)
 
#tf_idf_df.columns = tfidf_reversed_vocab

## keep only those tfidf features we saw last time
intersect = list(set(df_tfidf.columns).intersection(unlabeled_df_tfidf.columns))
unlabeled_df_tfidf = unlabeled_df_tfidf[intersect]
unlabeled_df_withtfidf = pd.concat([unlabeled_df, unlabeled_df_tfidf], axis=1)

# X = np.random.rand(15783, num_ftrs)
##some features are missing
df_newdata = unlabeled_df_withtfidf.reindex(labels=df_withtfidf.columns,axis=1)
df_newdata =df_newdata.fillna(0)
print(df_newdata.shape)
print(df_newdata.columns)

(20489, 13)
(20489, 48450)
(20489, 2467)
Index(['VideoID', 'Title', 'UploaderUsername', 'DateUploaded', 'VideoLink',
       'Status', 'Description', 'ThumbnailLink', 'UploaderId', 'ViewCount',
       ...
       '✩ beme', '✩ blog/website', '✩ facebook', '✩ instagram', '✩ snapchat!',
       '✩ spotify', '✩ tumblr', '✩ twitter', 'music:',
       'music: https://itunes.apple.com/profile/emmablackery'],
      dtype='object', length=2467)


## Predicting for new data: run trained classifiers on new data

In [49]:

feature_start, num_ftrs = 16,1496
print("Running prediction for new data")
X = df_newdata.iloc[:, feature_start:feature_start+num_ftrs]
print(len(X))
print(df_newdata.shape)
for feature in ['IsCollab','IsGaming']:
    X_scaled = all_scalers[feature].transform(X)
    y = all_classifiers[feature][best_classifiers[feature]].predict(X_scaled)
    unlabeled_df['predicted_' + feature] = y

Running prediction for new data
20489
(20489, 2467)


In [50]:
#print the file somewhere
unlabeled_df.to_csv(OUTPUT_CSV_FILE_PATH, index=False, encoding='utf-8-sig')