In [295]:
import pandas as pd
import numpy as np
import matplotlib as plt
import plotly
import os
import string

import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
from collections import defaultdict

from functools import reduce

from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize


from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score

In [45]:
data = pd.read_csv('train_data.csv')

In [46]:
data.head()

Unnamed: 0,id,about,comments,date_added,name,origin,origin_place,origin_year,other_text,photos,status,tags,type,videos,target_variable
0,0,Gremlin D.Va is a chibi-style cartoon caricatu...,62,"Feb 18, 2020 at 03:21PM EST",Gremlin D.Va,"In Overwatch, the playable character Hana Song...",Tumblr,2016,"Spread On June 2nd, Tumblr user fumeknight pos...",114,Confirmed,"fandom, doritos, mountain dew, gamer, gremlin,...",Character,6,460032
1,1,Note: this entry is very W.I.P feel free to re...,9,"Jan 16, 2020 at 10:37AM EST",Awkward Zombie,"Throughout 2005 and 2006, posted comics and f...",Katie Tiedrich,2005,"Throughout 2005 and 2006, posted comics and f...",160,Submission,"webcomic, video games, weekly, nintendo, smash...",Webcomic,1,12754
2,2,It comes from It’s Always Sunny in Philadelphi...,1,"Jun 16, 2019 at 03:11AM EDT",We all have cats that we'd like to be playing ...,It comes from It’s Always Sunny in Philadelphi...,It's Always Sunny in Philadelphia,Unknown,"Spread It has morphed into a snowclone, typica...",3,Submission,"snowclone, it's always sunny in philadelphia",Pop Culture Reference,0,727
3,3,Microsoft Songsmith is an application by Micro...,8,"Mar 25, 2016 at 06:44AM EDT",Songsmith,"On January 8th, Microsoft released the applica...",YouTube,2009,"The video cuts to the next scene, which is in ...",0,Confirmed,"audio, youtube, microsoft, autotune",Remix,0,13625
4,4,,70,"Oct 10, 2019 at 09:52AM EDT",What's All This Racket? / Mirada Fija,The 4-pane comic series was conceived on 4chan...,4chan,2009,The 4-pane comic series was conceived on 4chan...,99,Confirmed,"exploitable, 4pane, comic, racket, disappointm...",,3,624227


### Remove about, other text, name, origin because no value added

In [54]:
### Check if the ids are in order

data['id'].sum()/sum(i for i in range(len(data)))

1.0

In [55]:
### Check for Nans

data.isna().sum()

id                    0
about              1147
comments              0
date_added            0
name                  0
origin             4349
origin_place          2
origin_year           0
other_text         1477
photos                0
status                0
tags                  1
type               6011
videos                0
target_variable       0
dtype: int64

In [290]:
## Focus on tags for now

data[data['tags'].isna()]

Unnamed: 0,id,about,comments,date_added,name,origin,origin_place,origin_year,other_text,photos,status,tags,type,videos,target_variable


In [57]:
### Drop this row for tag analysis

data.drop(12544,axis=0,inplace=True)

### Create a new dataframe where we have as the row the tag and as columns every other tag and how many times they appear together

In [204]:
len(data)

13187

In [249]:
tag_set = set()

for tags in data['tags']:
    for ind_tag in tags.replace(" ","").replace(".","").split(","):
        tag_set.add(ind_tag.replace(".",""))
        
len(tag_set)

39197

In [308]:
tags_df = pd.DataFrame(zip(list(tag_set),[i for i in range(len(tag_set))]), columns=['tag','id'])

In [309]:
tags_df

Unnamed: 0,tag,id
0,,0
1,joeyholt,1
2,freddiehighmore,2
3,monkas,3
4,viviangomez35977,4
...,...,...
39192,bound4earth,39192
39193,jokerface,39193
39194,@joycewhitevance,39194
39195,televisionquote,39195


In [310]:
tags_df.set_index('tag',inplace=True)

In [311]:
tags_df

Unnamed: 0_level_0,id
tag,Unnamed: 1_level_1
,0
joeyholt,1
freddiehighmore,2
monkas,3
viviangomez35977,4
...,...
bound4earth,39192
jokerface,39193
@joycewhitevance,39194
televisionquote,39195


In [261]:
tag_dict = {}

for tag,value in tags_df.iterrows():
    tag_dict[tag] = list(value.values)[0]

In [263]:
vectorizer = CountVectorizer(vocabulary = tag_dict)

In [278]:
tag_corpus = []
for tags in data['tags']:
    tag_corpus.append(tags.replace(" ","").replace(".",""))

In [299]:
tag_vectorizer = vectorizer.fit_transform(tag_corpus).toarray()

In [301]:
tag_tag = pd.DataFrame(tag_vectorizer)

In [302]:
tag_tag.shape

(13187, 39197)

In [303]:
normalized_tag_distance = normalize(tag_tag.values, axis=0)

In [304]:
tag_tag_similarity = normalized_tag_distance.T.dot(normalized_tag_distance)
tag_tag_similarity = pd.DataFrame(tag_tag_similarity,
                                 index = tag_tag.columns,
                                 columns = tag_tag.columns)

In [307]:
tag_tag_similarity.sum()

0        0.000000
1        5.210096
2        2.273333
3        4.024645
4        9.661660
           ...   
39192    6.156014
39193    4.189723
39194    0.000000
39195    4.022858
39196    2.497202
Length: 39197, dtype: float64

In [313]:
tag_tag_similarity.shape


(39197, 39197)

In [315]:
tags_df.iloc[39194]

id    39194
Name: @joycewhitevance, dtype: int64

In [316]:
tag_tag_similarity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39187,39188,39189,39190,39191,39192,39193,39194,39195,39196
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
39193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
39194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Generalized model for validating

### This code block can try Different models and will draw a boxplot to compare. Replace these model names with the ones we need

### Will also need to import the necessary model haven't done that yet

In [None]:
# to feed the random state
seed = 7

# prepare models
models = []
#models.append(('LR', LogisticRegression()))
#models.append(('LDA', LinearDiscriminantAnalysis()))
#models.append(('KNN', KNeighborsClassifier()))
#models.append(('CART', DecisionTreeClassifier()))
#models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier()))

### Dataset is large, heavy computation on SVR, don;t do it
#models.append(('SVM', SVC(gamma='auto')))


#models.append(('XGB', XGBClassifier())) - Wasn't able to run this one, this is an ensemble one that we should definitely try

# evaluate each model in turn
results = []
names = []
scoring = 'precision'

for name, model in models:
        kfold = KFold(n_splits=10, random_state=seed)
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
        
# boxplot algorithm comparison
fig = plt.figure(figsize=(11,6))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()