<a href="https://colab.research.google.com/github/cccaaannn/machine_learning_colab/blob/master/document_classification/data_mining_hw6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download and unzip dataset

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip
!unzip YouTube-Spam-Collection-v1.zip

Imports

In [None]:
# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.model_selection import train_test_split

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

# other
import pandas as pd 
import numpy as np
import re

Load dataset

In [3]:
# One frame per CSV file extracted from the archive.
Psy = pd.read_csv("Youtube01-Psy.csv")
KatyPerry = pd.read_csv("Youtube02-KatyPerry.csv")
LMFAO = pd.read_csv("Youtube03-LMFAO.csv")
Eminem = pd.read_csv("Youtube04-Eminem.csv")
Shakira = pd.read_csv("Youtube05-Shakira.csv")

# Kept as a list so later cells can evaluate on each file separately.
dfs = [Psy, KatyPerry, LMFAO, Eminem, Shakira]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every source frame contributes its own 0-based labels, so index labels repeat
# and label-based lookups (e.g. combined_df.loc[0]) return several rows.
combined_df = pd.concat(dfs, ignore_index=True)

Dataset info

In [None]:
# Preview the first rows of the combined frame.
combined_df.head()

In [None]:
# Print the pandas summary (DataFrame.info) of the combined frame.
combined_df.info()

Helper functions

In [4]:
def test_and_report(model, X_test, y_test):
    """Print evaluation metrics for a fitted model.

    Predicts on ``X_test`` once, then prints, in order: the confusion matrix,
    the classification report, and the accuracy score, each computed against
    ``y_test``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels aligned with ``X_test``.

    Returns:
        None. Results are only printed.
    """
    predictions = model.predict(X_test)

    # Both reports share the (y_true, y_pred) signature and are printed as-is.
    for build_report in (confusion_matrix, classification_report):
        print(build_report(y_test, predictions))

    accuracy = accuracy_score(y_test, predictions)
    print("accuracy: {}".format(accuracy))

In [5]:
# Shared NLTK objects, created on first use and then reused for every word
# instead of being rebuilt on each call.
_STEMMING_TOOLS = {}


def _get_stemming_tools():
    """Return the shared (stemmer, lemmatizer) pair, creating it on first use."""
    if not _STEMMING_TOOLS:
        _STEMMING_TOOLS["stemmer"] = PorterStemmer()
        _STEMMING_TOOLS["lemmatizer"] = WordNetLemmatizer()
    return _STEMMING_TOOLS["stemmer"], _STEMMING_TOOLS["lemmatizer"]


def stemming(word):
    """Stem and then lemmatize a single word, re-applying its casing style.

    The input is converted with ``str()``. Its casing style is detected up
    front and re-applied after each step:

    * equal to its ``title()`` form -> result is ``capitalize()``-d
    * otherwise all upper-case      -> result is ``upper()``-ed
    * anything else                 -> stemmer/lemmatizer output is used as-is

    Args:
        word: Token to normalise (any object; converted with ``str``).

    Returns:
        str: The stemmed and lemmatized token.
    """
    stemmer, lemmatizer = _get_stemming_tools()
    word = str(word)

    # Decide once how casing is restored; the title check has priority over
    # the upper-case check, exactly as in the branch order it replaces.
    if word == word.title():
        restore_case = str.capitalize
    elif word.isupper():
        restore_case = str.upper
    else:
        restore_case = None

    word = stemmer.stem(word)
    if restore_case is not None:
        word = restore_case(word)

    word = lemmatizer.lemmatize(word)
    if restore_case is not None:
        word = restore_case(word)

    return word

In [6]:
# Cache for the English stop-word list. Filled on first use so the corpus is
# loaded once instead of once per token.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset (loaded once)."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def rm_stop_words(word, case_sensitive=False):
    """Blank out a token if it is an English stop word.

    The stop-word check is always done on the lower-cased token.

    Args:
        word (str): Token to check.
        case_sensitive (bool): When True a kept token is returned unchanged;
            when False it is returned lower-cased.

    Returns:
        str: ``""`` for a stop word, otherwise the (possibly lower-cased) token.
    """
    lowered = word.lower()
    if lowered in _english_stop_words():
        return ""
    return word if case_sensitive else lowered

In [7]:
def process_comments(raw_comments, special=False, single=True, nums=True, lowercase=True, stem=True, rm_stops=True):
    """Clean a collection of raw comment strings for vectorization.

    Steps are applied to each comment in this order, each one only when its
    flag is set:

    1. ``special``   - replace every non-word character with a space.
    2. ``single``    - drop single ASCII letters that are surrounded by
       whitespace (a letter at the very start or end of the text is kept).
    3. ``nums``      - remove all digit characters.
    4. ``lowercase`` - lower-case the text.
    5. ``rm_stops``  - drop English stop words (via ``rm_stop_words``).
    6. ``stem``      - stem/lemmatize every remaining token (via ``stemming``).

    Args:
        raw_comments: Iterable of comment strings.
        special (bool): Enable step 1.
        single (bool): Enable step 2.
        nums (bool): Enable step 3.
        lowercase (bool): Enable step 4. Also controls whether kept tokens are
            lower-cased by the stop-word step.
        stem (bool): Enable step 6.
        rm_stops (bool): Enable step 5.

    Returns:
        list[str]: One cleaned string per input comment, in input order.
    """
    comments = []
    for comment in raw_comments:

        # remove special characters
        if special:
            comment = re.sub(r'\W', ' ', comment)

        # remove single characters
        # The pattern consumes a whole run of single letters at once. Matching
        # one letter together with both neighbouring spaces would use up the
        # space that the next letter needs, so "a b c" would leave "b" behind.
        if single:
            comment = re.sub(r'(?:\s+[a-zA-Z])+\s+', ' ', comment)

        # remove numbers
        if nums:
            comment = ''.join(ch for ch in comment if not ch.isdigit())

        # to lowercase
        if lowercase:
            comment = comment.lower()

        if stem or rm_stops:
            tokens = comment.split()

            # Stop words are filtered BEFORE stemming: stemming rewrites many
            # of them (e.g. "this" -> "thi", "was" -> "wa"), after which they
            # no longer match the stop-word list and would slip through.
            if rm_stops:
                tokens = [rm_stop_words(t, case_sensitive=not lowercase) for t in tokens]
                # rm_stop_words returns "" for stop words; drop those so they
                # do not turn into stray spaces when the tokens are re-joined.
                tokens = [t for t in tokens if t]

            if stem:
                tokens = [stemming(t) for t in tokens]

            comment = ' '.join(tokens)

        comments.append(comment)

    return comments

Choose dataset

In [8]:
# Active choice: all files combined; lowercase=False skips the lowercasing step.
comments = process_comments(combined_df["CONTENT"], lowercase=False)
y = combined_df["CLASS"]

# Alternative: all files combined, default preprocessing flags.
# comments = process_comments(combined_df["CONTENT"])
# y = combined_df["CLASS"]

# Alternative: a single file only, default preprocessing flags.
# comments = process_comments(Eminem["CONTENT"])
# y = Eminem["CLASS"]

Vectorizers

In [None]:
# Bag-of-words features limited to 850 terms, with English stop words removed.
count_vectorizer = CountVectorizer(max_features=850, min_df=1, stop_words=stopwords.words("english"))

count_model = count_vectorizer.fit(comments)
# NOTE: the TF-IDF cell assigns `x` as well, so whichever vectorizer cell ran
# last is the one that feeds the train/test split.
x = count_model.transform(comments)
print(x.shape)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back only on old versions that predate it.
if hasattr(count_model, "get_feature_names_out"):
    count_feature_names = count_model.get_feature_names_out().tolist()
else:
    count_feature_names = count_model.get_feature_names()
print(count_feature_names)

In [None]:
# Earlier configuration, kept for reference:
# tfidf_vectorizer = TfidfVectorizer(max_features=250, min_df=5, stop_words=stopwords.words("english"), lowercase=False)
tfidf_vectorizer = TfidfVectorizer(max_features=850, min_df=1, stop_words=stopwords.words("english"))

tfidf_model = tfidf_vectorizer.fit(comments)
# NOTE: this overwrites any `x` produced by the count-vectorizer cell; the
# per-dataset evaluation cell also transforms with `tfidf_model`.
x = tfidf_model.transform(comments)
print(x.shape)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back only on old versions that predate it.
if hasattr(tfidf_model, "get_feature_names_out"):
    tfidf_feature_names = tfidf_model.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_model.get_feature_names()
print(tfidf_feature_names)

Train test split

In [None]:
# random_state pins the shuffle so the same rows land in train/test on every
# run; without it the reported metrics change each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=42)
print(X_train.shape, X_test.shape)

Training

In [None]:
# Logistic regression wrapped in a one-vs-rest scheme.
one_vs_all = OneVsRestClassifier(LogisticRegression())
one_vs_all.fit(X_train, y_train)

# random_state fixes the forest's bootstrap/feature sampling so repeated runs
# on the same split produce the same model and the same scores.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

Testing

In [None]:
# Report each fitted model on the held-out split, one after the other.
for label, fitted_model in (("one vs rest", one_vs_all), ("random forest", random_forest)):
    print(label)
    test_and_report(fitted_model, X_test, y_test)

Testing the model against the other singers (for when training did not use the combined dataset)

In [None]:
# Evaluate both models on every individual file.
# NOTE: when the models were trained on `combined_df`, these frames contain the
# training rows too, so the scores printed here are optimistic.
for df in dfs:
    # Use the same preprocessing flags as the call that built `comments`
    # (lowercase=False); otherwise evaluation text is normalised differently
    # from the text the vectorizer and models were fitted on. Keep this in sync
    # if the "Choose dataset" cell is switched to another variant.
    df_comments = process_comments(df["CONTENT"], lowercase=False)
    # Transforms with the fitted TF-IDF vocabulary; assumes the models were
    # trained on TF-IDF features (i.e. the TF-IDF cell produced `x`).
    df_features = tfidf_model.transform(df_comments)

    print("one vs all\n")
    test_and_report(one_vs_all, df_features, df["CLASS"])
    print("\nrandom forest\n")
    test_and_report(random_forest, df_features, df["CLASS"])
    print("\n"+"-"*50+"\n")