# Logistic Regression

### Import libraries

In [1]:
import pandas as pd # one-hot encoding
import numpy as np
import glob
import os

import matplotlib.pyplot as plt # graphs
import matplotlib.colors as colors

from collections import defaultdict

import pickle
import sklearn
from sklearn.utils import resample # downsample dataset
from sklearn.model_selection import train_test_split # split to training and testing datasets
from sklearn.model_selection import GridSearchCV # cross validation
#from sklearn.preprocessing import scale # scale and center data
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

### Import data

We work with 1956 comments from 5 different YouTube videos. The [YouTube Spam Collection Data Set](https://archive.ics.uci.edu/ml/datasets/YouTube+Spam+Collection#) is freely available.

In [2]:
# Directory holding the per-video CSV files of the YouTube Spam Collection.
path = r"data/YouTube-Spam-Collection/"

# glob.glob returns matches in arbitrary (file-system dependent) order; sorting makes the
# row order of `corpus` -- and with it every seeded split further down -- reproducible.
files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not files:
    # Fail with a clear message instead of pandas' generic "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Stack all files into one frame with a fresh 0..n-1 index.
corpus = pd.concat((pd.read_csv(file) for file in files), ignore_index=True)
corpus.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [3]:
# Sanity check: the concatenated files must add up to the 1956 comments described above.
assert corpus.shape[0] == 1956

### Preprocessing

In [4]:
#nltk.download('stopwords')
#nltk.download('omw-1.4')
    
def preprocess_data(corpus,
                    columns=["CONTENT"],
                    irrelevant_features=["COMMENT_ID", "AUTHOR", "DATE"],
                   ):
    """Clean the comment texts in ``corpus["CONTENT"]`` and return them as strings.

    Every comment is lower-cased, tokenized with ``nltk.word_tokenize``,
    lemmatized with ``WordNetLemmatizer``, stripped of English stop words and
    re-joined with single spaces.

    Side effect: the columns listed in ``irrelevant_features`` are dropped from
    ``corpus`` in place, so the caller's DataFrame is modified.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame with a ``"CONTENT"`` column of comment strings.
    columns : list of str
        Currently unused; the text column is always ``"CONTENT"``.
    irrelevant_features : list of str
        Column names to remove from ``corpus``. Names that are not present
        (for example because the function already ran on this frame) are skipped.

    Returns
    -------
    list of str
        One cleaned, space-joined string per row, in row order.

    Notes
    -----
    Needs the NLTK data used by the tokenizer, lemmatizer and stop-word list
    to be installed (see the commented-out ``nltk.download`` calls in this cell).
    """
    # errors="ignore" makes the in-place drop idempotent: re-running the cell on the
    # already-reduced frame no longer raises KeyError.
    corpus.drop(irrelevant_features, axis=1, inplace=True, errors="ignore")

    lemma = WordNetLemmatizer()
    # A set gives constant-time membership tests inside the per-token filter below.
    stop_words = set(stopwords.words("english"))

    cleaned_data = []
    for comment in corpus["CONTENT"]:
        tokens = nltk.word_tokenize(comment.lower())
        # Lemmatize first, then filter, so stop words are matched on the lemma.
        lemmas = (lemma.lemmatize(token) for token in tokens)
        cleaned_data.append(" ".join(word for word in lemmas if word not in stop_words))

    return cleaned_data

In [5]:
cleaned_data = preprocess_data(corpus)

# Every row must yield exactly one cleaned string, otherwise features and labels misalign.
assert len(cleaned_data) == len(corpus)

# Show a small sample only; printing all cleaned comments floods the notebook output.
cleaned_data[:5]



In [6]:
# binary feature representation (term present / absent per comment)
vectorizer = CountVectorizer(binary=True, max_df=0.95)
BOW = vectorizer.fit_transform(cleaned_data)

# count based feature representation
vectorizer_2 = CountVectorizer(binary=False, max_df=0.95)
BOW_2 = vectorizer_2.fit_transform(cleaned_data)

# bag of 2-grams
# `cleaned_data` holds plain strings, so an identity tokenizer would hand the n-gram
# builder a string and produce *character* pairs. The default word tokenizer yields
# word bigrams, and dropping the lambda keeps the vectorizer picklable.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
BOW_3 = bigram_vectorizer.fit_transform(cleaned_data)

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
tfidf_voc = tfidf_vectorizer.fit_transform(cleaned_data)

### Statistics about the data

In [7]:
# Column dtypes plus the class balance of the corpus.
class_labels = corpus['CLASS'].unique()
n_spam = len(corpus.loc[corpus['CLASS'] == 1])
n_legitimate = len(corpus.loc[corpus['CLASS'] == 0])

print(f"Data types:\n{corpus.dtypes}\n")
print(f"There are {len(class_labels)} comment types: {class_labels}")
print(f"The dataset contains of {len(corpus)} examples: {n_spam} spam and {n_legitimate} legitimate comments")

Data types:
CONTENT    object
CLASS       int64
dtype: object

There are 2 comment types: [1 0]
The dataset contains of 1956 examples: 1005 spam and 951 legitimate comments


### Split the data

In [8]:
# Hold out 30% of the binary bag-of-words rows for testing (seeded, shuffled).
# NOTE(review): BOW was fitted on the full corpus before this split, so the vocabulary
# and the max_df filter have already seen the test comments -- confirm this is acceptable.
labels = np.asarray(corpus["CLASS"])
X_train, X_test, y_train, y_test = train_test_split(
    BOW,
    labels,
    test_size=0.3,  # default is 0.25
    random_state=42,
    shuffle=True,
)

In [9]:
# Report the shape of each split: features first, then labels.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(1369, 4194)
(587, 4194)
(1369,)
(587,)


### Logistic Regression model

In [10]:
# Baseline classifier with C=1.0, evaluated on the held-out split.
log_reg = LogisticRegression(C=1.0)
model = log_reg.fit(X=X_train, y=y_train)

score = model.score(X=X_test, y=y_test)
print(f"Test score: {score}")

Test score: 0.9557069846678024


In [11]:
# 5-fold cross-validated grid search over the regularisation parameter C.
param_grid_ = {'C': [0.00001, 0.001, 0.1, 1.0, 10.0, 100.0]}
clf = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid_, cv=5)
clf.fit(X_train, y_train)

# Score the search object on the held-out split and report the selected setting.
score = clf.score(X_test, y_test)
print(f"Best parameters set: {clf.best_params_}")
print(f"Test score: {score}")

Best parameters set: {'C': 10.0}
Test score: 0.9557069846678024


In [12]:
model_output_path = 'saved_models/log_reg_clf.sav'

# open() does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(model_output_path), exist_ok=True)

# save model to disk; the context manager guarantees the handle is flushed and closed
# NOTE(review): this persists `model` (the C=1.0 baseline), not the grid-searched `clf`
# -- confirm that this is the intended estimator.
with open(model_output_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [13]:
def load_saved_model(model_path):
    """Load a pickled object from ``model_path``, print it and return it.

    Parameters
    ----------
    model_path : str or path-like
        Location of the pickle file to read.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.

    Notes
    -----
    Only load files you trust: unpickling can execute arbitrary code.
    """
    # The context manager closes the file handle even if unpickling fails.
    with open(model_path, 'rb') as model_file:
        clf = pickle.load(model_file)
    print(clf)
    return clf

In [14]:
# Read the persisted estimator back from disk; this rebinds `clf`.
clf = load_saved_model(model_path=model_output_path)

LogisticRegression()
