# Exploration 2
Time to graph out some visualizations about the data.

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

## Reading Datasets

In [2]:
df = pd.read_csv('../data/dataset.csv')

In [3]:
df.drop(['Unnamed: 0', 'index'], axis=1, inplace=True)

In [4]:
df.head()

Unnamed: 0,profile,Comedy,Action,Fantasy,Adventure,Drama,Sci-Fi,Hentai,Kids,Shounen,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi
0,DesolatePsyche,8.0,8.0,5.0,7.0,9.0,2.0,0.0,0.0,5.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,baekbeans,6.0,5.0,3.0,2.0,2.0,4.0,0.0,0.0,5.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,skrn,3.0,3.0,3.0,1.0,5.0,3.0,0.0,0.0,2.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,edgewalker00,3.0,3.0,2.0,1.0,1.0,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,aManOfCulture99,7.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
animes = pd.read_csv('../data/animes_clean.csv')

In [6]:
animes.head()

Unnamed: 0,uid,title,episodes,members,score,Comedy,Action,Fantasy,Adventure,Drama,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi
0,9317,Doll Saaya,1.0,609,4.61,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,38339,Suzumi-bune,1.0,137,5.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,39731,Na Bbeun Sang Sa,1.0,149,5.61,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
3,40131,Junjou Juugeki Cosplay Shoujo,1.0,117,3.95,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5569,Tsui no Sora,1.0,1821,2.84,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Checking for duplicates and removing them
There are still some duplicated data in the user's genre dataframe. Not sure where it came from, but they should be removed.

In [7]:
print(len(df))
print(len(df['profile'].unique()))

65125
37463


In [8]:
df = df.drop_duplicates()

In [9]:
len(df)

37476

In [10]:
print(len(animes))
print(len(animes['uid'].unique()))

15613
15613


## What is Available vs. What People Watch
Lets look at what are the most common genres vs the genres that people watch, as in favorite.

In [11]:
user_genre_sum = df.drop('profile', axis=1).sum()
fig = px.bar(user_genre_sum, title='Cumulative Sum of Genres for All Users')
fig.show()


In [12]:
anime_genre_sum = animes.drop(['uid', 'title', 'episodes', 'members', 'score'], axis=1).sum()
fig = px.bar(anime_genre_sum, title='Cumulative Sum of All Anime Genres')
fig.show()

### Proportion of user favorites in relation to Market Share


In [13]:
user_genre_sum_percent = df.drop('profile', axis=1).mean() / df.drop('profile', axis=1).mean().sum(axis=0)

In [14]:
px.bar(user_genre_sum_percent)

In [15]:
anime_genre_sum_percent = animes.drop(['uid', 'title', 'episodes', 'members', 'score'], axis=1).mean() / animes.drop(['uid', 'title', 'episodes', 'members', 'score'], axis=1).mean().sum(axis=0)

In [16]:
px.bar(anime_genre_sum_percent)

In [17]:
px.bar(user_genre_sum_percent / anime_genre_sum_percent)

## Co-Occurrence of Genres
What genres tend to be together? We will find that by creating a co-occurrence matrix.
Myanimelist has both genres, demographics, and themes available for filtering. Here, we will look at each of these categories individually

In [173]:
co_occ = df.drop('profile', axis=1)
co_occ_cols = co_occ.columns

In [174]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(co_occ)
co_occ = scaler.transform(co_occ)
co_occ = pd.DataFrame(co_occ, columns=co_occ_cols)

In [175]:
co_genre = co_occ.T.dot(co_occ)

In [176]:
co_genre

Unnamed: 0,Comedy,Action,Fantasy,Adventure,Drama,Sci-Fi,Hentai,Kids,Shounen,Romance,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi
Comedy,37476.0,13399.572478,11060.342779,14036.010129,10715.175168,5722.886227,-2028.256498,2780.159309,18066.487166,15030.17199,...,-1577.084682,6068.612506,3425.097695,-1124.001961,483.693923,3022.850967,2706.697484,2552.27091,-96.378648,-554.830494
Action,13399.572478,37476.0,18900.091463,22814.157959,11391.774276,17071.271339,-1718.243027,1488.375504,20997.973978,100.633277,...,6574.423581,9077.830534,5261.176509,4388.601721,3102.841543,-1389.702844,-367.161265,-940.162142,-453.540234,-764.746205
Fantasy,11060.342779,18900.091463,37476.0,23487.280696,10849.941332,3568.971001,-811.707794,4141.225651,14948.342098,7732.326069,...,589.685962,939.014234,1893.495089,2919.728089,-455.794078,498.83546,-39.538362,828.743021,-256.920245,-675.944593
Adventure,14036.010129,22814.157959,23487.280696,37476.0,9571.445598,9975.8974,-1661.399382,4445.81859,18576.511112,2633.718203,...,2614.109474,6333.383204,2225.477084,806.432577,1054.092055,-266.724037,-908.25889,-1218.477227,-360.410952,-589.627072
Drama,10715.175168,11391.774276,10849.941332,9571.445598,37476.0,15251.738992,-2265.521796,75.754,8176.85192,15998.891648,...,5555.888819,1421.231194,2574.980276,9957.692149,1494.009391,6517.978489,1865.259287,1575.799031,-935.952621,-295.070252
Sci-Fi,5722.886227,17071.271339,3568.971001,9975.8974,15251.738992,37476.0,-1407.612019,-73.651541,2165.909407,2107.245215,...,8840.202177,6303.83664,2204.572833,8407.882879,2538.605232,-730.677514,-1958.03055,-66.244647,-351.727311,-539.483148
Hentai,-2028.256498,-1718.243027,-811.707794,-1661.399382,-2265.521796,-1407.612019,37476.0,4.697119,-1983.286445,-1436.784809,...,-793.78801,-598.027348,-465.26862,-1498.100461,-169.544728,-637.411252,-171.185344,348.837948,12107.182217,6941.761573
Kids,2780.159309,1488.375504,4141.225651,4445.81859,75.754,-73.651541,4.697119,37476.0,285.912814,-654.855068,...,-511.707489,-430.065316,-853.974726,-992.273315,142.830818,-235.555847,18.021162,194.555432,286.396832,613.2577
Shounen,18066.487166,20997.973978,14948.342098,18576.511112,8176.85192,2165.909407,-1983.286445,285.912814,37476.0,1095.95607,...,5459.996675,9726.179932,2142.571197,4661.860109,580.638339,-502.619999,423.443088,-1486.098067,-447.043447,-830.422323
Romance,15030.17199,100.633277,7732.326069,2633.718203,15998.891648,2107.245215,-1436.784809,-654.855068,1095.95607,37476.0,...,-2786.729126,-958.685679,4644.993543,-312.446116,-574.264921,4377.628822,3469.876978,2821.692575,-372.171672,83.139542


In [179]:
fig = px.imshow(co_genre, title='Genre Co-Occurrence', height=1000)

fig.show()

# Differences In Preferences Between Genders
Let's separate the users by gender. There may be interesting groupings of preferences that exist between the different genders.

In [23]:
profiles = pd.read_csv('../data/profiles_clean.csv')
profiles = profiles.drop(['Unnamed: 0', 'favorites_anime'], axis=1)

In [24]:
profiles.head()

Unnamed: 0,profile,gender
0,DesolatePsyche,Male
1,baekbeans,Female
2,skrn,
3,edgewalker00,Male
4,aManOfCulture99,Male


In [25]:
print(len(profiles))
print(len(profiles['profile'].unique()))

81727
47885


In [26]:
profiles = profiles.drop_duplicates()

In [27]:
df_gen = profiles.merge(right=df, on='profile', how='inner')

In [28]:
df_gen

Unnamed: 0,profile,gender,Comedy,Action,Fantasy,Adventure,Drama,Sci-Fi,Hentai,Kids,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi
0,DesolatePsyche,Male,8.0,8.0,5.0,7.0,9.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,baekbeans,Female,6.0,5.0,3.0,2.0,2.0,4.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,skrn,,3.0,3.0,3.0,1.0,5.0,3.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,edgewalker00,Male,3.0,3.0,2.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,aManOfCulture99,Male,7.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37471,WeissYuki,,2.0,3.0,1.0,3.0,5.0,3.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
37472,Akuteru,Male,1.0,4.0,2.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
37473,Razielek,Male,1.0,2.0,0.0,2.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37474,Mattierial,,2.0,2.0,3.0,2.0,6.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
import matplotlib.pyplot as plt

In [30]:
df_gen['gender'] = df_gen['gender'].fillna('Not Specified')
df_gen = df_gen.drop(df_gen[df_gen['gender'] == 'Non-Binary'].index)

In [367]:
import plotly

In [368]:
plotly.colors.DEFAULT_PLOTLY_COLORS

['rgb(31, 119, 180)',
 'rgb(255, 127, 14)',
 'rgb(44, 160, 44)',
 'rgb(214, 39, 40)',
 'rgb(148, 103, 189)',
 'rgb(140, 86, 75)',
 'rgb(227, 119, 194)',
 'rgb(127, 127, 127)',
 'rgb(188, 189, 34)',
 'rgb(23, 190, 207)']

In [373]:
fig = px.pie(df_gen.groupby('gender').count().reset_index(), names='gender', values='profile', color_discrete_sequence=['#636EFA', '#00CC96', '#EF553B'])
fig.show()

### Missing Gender Entries
What to do with users who have not specified their gender? About a quarter of all users have not listed their gender on their profiles. While there are enough data entries to simply throw out data that doesn't have gender, it would be quite a waste to do so. Instead, these gender values could be predicted using a logistic regression.

In [32]:
df_gen

Unnamed: 0,profile,gender,Comedy,Action,Fantasy,Adventure,Drama,Sci-Fi,Hentai,Kids,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi
0,DesolatePsyche,Male,8.0,8.0,5.0,7.0,9.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,baekbeans,Female,6.0,5.0,3.0,2.0,2.0,4.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,skrn,Not Specified,3.0,3.0,3.0,1.0,5.0,3.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,edgewalker00,Male,3.0,3.0,2.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,aManOfCulture99,Male,7.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37471,WeissYuki,Not Specified,2.0,3.0,1.0,3.0,5.0,3.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
37472,Akuteru,Male,1.0,4.0,2.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
37473,Razielek,Male,1.0,2.0,0.0,2.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37474,Mattierial,Not Specified,2.0,2.0,3.0,2.0,6.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

Select the entries with a gender, convert the gender to dummy variable, and then scale the genre counts.

In [48]:
df_gen_pred = df_gen.drop(df_gen[df_gen['gender'] == 'Non-Binary'].index)

In [77]:
df_gen_pred = df_gen.drop(df_gen[df_gen['gender'] == 'Not Specified'].index)

In [68]:
unk_gen = df_gen[df_gen['gender'] == 'Not Specified']

1. Convert gender to dummy variable

In [54]:
def gen_dummy(gender):
    if gender == 'Male':
        return 1
    return 0

In [55]:
df_gen_pred['gender'] = df_gen_pred['gender'].apply(gen_dummy)

2. Scale the genre counts

In [56]:
genders = df_gen_pred['gender']
genders

0        1
1        0
3        1
4        1
7        1
        ..
37468    1
37469    1
37472    1
37473    1
37475    1
Name: gender, Length: 27389, dtype: int64

In [57]:
df_gen_pred = df_gen_pred.drop(['profile', 'gender'], axis=1)

In [58]:
scaler = StandardScaler()
scaler.fit(df_gen_pred)
gen_pred_scaled = scaler.transform(df_gen_pred)

In [59]:
gen_pred_scaled

array([[ 2.77105232,  2.74040081,  2.67978778, ..., -0.1499888 ,
        -0.0249686 , -0.04450795],
       [ 1.72381608,  1.20678201,  1.19992595, ..., -0.1499888 ,
        -0.0249686 , -0.04450795],
       [ 0.15296172,  0.18436947,  0.45999503, ..., -0.1499888 ,
        -0.0249686 , -0.04450795],
       ...,
       [-0.89427452,  0.69557574,  0.45999503, ..., -0.1499888 ,
        -0.0249686 , -0.04450795],
       [-0.89427452, -0.32683679, -1.01986681, ..., -0.1499888 ,
        -0.0249686 , -0.04450795],
       [ 0.15296172, -0.32683679, -1.01986681, ..., -0.1499888 ,
        -0.0249686 , -0.04450795]])

In [60]:
X_train, X_test, y_train, y_test = train_test_split(gen_pred_scaled, genders, test_size=0.3, random_state=42)

In [61]:
gen_clf = LogisticRegression(random_state=42)
gen_clf.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [62]:
y_pred = gen_clf.predict(X_train)

In [63]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [64]:
accuracy_score(y_train, y_pred)

0.7889630711454204

In [65]:
confusion_matrix(y_train, y_pred)

array([[ 2694,  3229],
       [  817, 12432]], dtype=int64)

Accuracy is around 80%. Considering that some people may not be honest about their gender online, I would say this is a decent predictor.

Save the gender logistic regression predictor

In [47]:
import pickle
pickle.dump(gen_clf, open('../models/gender_logreg.pickle', 'wb'))

### Generate values for users with missing gender

2. Scale the genre counts

In [70]:
unk_gen_pred = unk_gen.drop(['profile', 'gender'], axis=1)

In [71]:
scaler = StandardScaler()
scaler.fit(unk_gen_pred)
unk_gen_scaled = scaler.transform(unk_gen_pred)

In [72]:
gen_pred = gen_clf.predict(unk_gen_scaled)

In [73]:
unk_gen['gender'] = gen_pred



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [74]:
unk_gen

Unnamed: 0,profile,gender,Comedy,Action,Fantasy,Adventure,Drama,Sci-Fi,Hentai,Kids,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi
2,skrn,1,3.0,3.0,3.0,1.0,5.0,3.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,eneri,1,3.0,5.0,1.0,2.0,3.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Waffle_Empress,0,5.0,1.0,2.0,1.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0
13,12sed,1,0.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,Rigas,1,7.0,5.0,4.0,3.0,3.0,1.0,0.0,2.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37463,TokiPui,1,2.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37464,Konin,1,1.0,2.0,2.0,1.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37470,KidCaramel,1,3.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
37471,WeissYuki,1,2.0,3.0,1.0,3.0,5.0,3.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [85]:
kno_gen = df_gen.drop(df_gen[df_gen['gender'] == 'Non-Binary'].index)

In [86]:
kno_gen = df_gen.drop(df_gen[df_gen['gender'] == 'Not Specified'].index)

In [87]:
kno_gen['gender'] = kno_gen['gender'].apply(gen_dummy)

## Rejoining users with known gender and users with unknown gender

In [88]:
kno_gen = pd.concat([kno_gen, unk_gen])

In [92]:
kno_gen = kno_gen.rename(columns={'gender': 'male'})

In [93]:
kno_gen

Unnamed: 0,profile,male,Comedy,Action,Fantasy,Adventure,Drama,Sci-Fi,Hentai,Kids,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi
0,DesolatePsyche,1,8.0,8.0,5.0,7.0,9.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,baekbeans,0,6.0,5.0,3.0,2.0,2.0,4.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,edgewalker00,1,3.0,3.0,2.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,aManOfCulture99,1,7.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,NIGGER_BONER,1,2.0,7.0,3.0,2.0,1.0,2.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37463,TokiPui,1,2.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37464,Konin,1,1.0,2.0,2.0,1.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37470,KidCaramel,1,3.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
37471,WeissYuki,1,2.0,3.0,1.0,3.0,5.0,3.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


Save the updated dataset for future use.

In [94]:
pickle.dump(kno_gen, open('../data/dataset_gender.pickle', 'wb'))

# Breakdown of updated genders
With the unknown gender values filled out, let's take a look at the gender distribution again.

In [363]:
fig = px.pie(final_df.groupby('male').count().reset_index(), names='male', values='profile')
fig.show()

# Differences between genders

In [99]:
kno_gen

Unnamed: 0,profile,male,Comedy,Action,Fantasy,Adventure,Drama,Sci-Fi,Hentai,Kids,...,Police,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi
0,DesolatePsyche,1,8.0,8.0,5.0,7.0,9.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,baekbeans,0,6.0,5.0,3.0,2.0,2.0,4.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,edgewalker00,1,3.0,3.0,2.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,aManOfCulture99,1,7.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,NIGGER_BONER,1,2.0,7.0,3.0,2.0,1.0,2.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37463,TokiPui,1,2.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37464,Konin,1,1.0,2.0,2.0,1.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37470,KidCaramel,1,3.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
37471,WeissYuki,1,2.0,3.0,1.0,3.0,5.0,3.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
male_genre_sum_percent = kno_gen[kno_gen['male'] == 1].drop(['profile', 'male'], axis=1).mean() / kno_gen.drop(['profile', 'male'], axis=1).mean().sum(axis=0)

In [107]:
fig = px.bar(male_genre_sum_percent, title='Genre Distribution for Males')
fig.show()

In [104]:
female_genre_sum_percent = kno_gen[kno_gen['male'] == 0].drop(['profile', 'male'], axis=1).mean() / kno_gen.drop(['profile', 'male'], axis=1).mean().sum(axis=0)

AttributeError: 'numpy.float64' object has no attribute 'index'

In [145]:
gender_genre_sum_percent = pd.DataFrame(male_genre_sum_percent).T
gender_genre_sum_percent = gender_genre_sum_percent.append(pd.DataFrame(female_genre_sum_percent).T)

In [151]:
gender_genre_sum_percent['gender'] = ['male', 'female']

In [153]:
gender_genre_sum_percent = gender_genre_sum_percent.reset_index().drop('index', axis=1).set_index('gender')

In [156]:
gender_genre_sum_percent = gender_genre_sum_percent.T

In [157]:
gender_genre_melt = pd.melt(gender_genre_sum_percent)

In [165]:
px.bar(gender_genre_sum_percent.apply(lambda x: x / anime_genre_sum_percent), barmode='group')

## Choosing How many clusterings are present
In order to do unsupervised learning, we will need to figure out how many groupings are present in the data. For starters, there must be at least 2 since gender plays a large role as evidenced by the accuracy of the logistic regression. For females, animes listed under female demographics like shoujo, josei, etc. are evidently more favorited than males. As for males, genres that can be attributed to male preferences like sci-fi, space, cars, and others. Considering all this and time constraints for this presentation, I chose 4 for the number of clusters. For each of the genders, there are at least 2 distinct groupings which we will examine individually.

## Clustering with Hierarchical Clustering
The clustering algorithm I chose was hierarchical clustering. I chose hierarchical clustering since the list of genres also includes themes and demographics, which can have a tree like structure. For example, space and mecha are both related to sci-fi.

In [180]:
from sklearn.cluster import AgglomerativeClustering

In [261]:
cluster_train = kno_gen.copy()

In [262]:
final_df = cluster_train.copy()

In [263]:
cluster_train.drop('profile', axis=1, inplace=True)

In [265]:
# cluster_train_male = cluster_train['male']
# cluster_train.drop('male', axis=1, inplace=True)
cluster_train_cols = cluster_train.columns

Use standardscaler to scale data before clustering, since it uses a distance metric. Users with lots of favorites would be very different from users with few favorites otherwise.

In [266]:
# cluster_train #.apply(lambda x: x / x.sum())
scaler = StandardScaler()
scaler.fit(cluster_train)
cluster_train = pd.DataFrame(scaler.transform(cluster_train), columns=cluster_train_cols)

In [255]:
# cluster_train['male'] = cluster_train_male

In [267]:
cluster_train.dropna(inplace=True)

In [236]:
pickle.dump(cluster_train, open('../data/cluster_train.pickle', 'wb'))

In [268]:
agg_clf = AgglomerativeClustering(n_clusters=4)
y_pred = agg_clf.fit(cluster_train)

In [269]:
y_pred.labels_

array([0, 1, 1, ..., 1, 0, 2], dtype=int64)

In [270]:
final_df['group'] = y_pred.labels_

In [271]:
final_df

Unnamed: 0,profile,male,Comedy,Action,Fantasy,Adventure,Drama,Sci-Fi,Hentai,Kids,...,Samurai,Vampire,Thriller,Cars,Josei,Shounen Ai,Shoujo Ai,Yuri,Yaoi,group
0,DesolatePsyche,1,8.0,8.0,5.0,7.0,9.0,2.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
1,baekbeans,0,6.0,5.0,3.0,2.0,2.0,4.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,edgewalker00,1,3.0,3.0,2.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,aManOfCulture99,1,7.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
7,NIGGER_BONER,1,2.0,7.0,3.0,2.0,1.0,2.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37463,TokiPui,1,2.0,3.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
37464,Konin,1,1.0,2.0,2.0,1.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
37470,KidCaramel,1,3.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
37471,WeissYuki,1,2.0,3.0,1.0,3.0,5.0,3.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [324]:
def dummy_to_gender(male):
    if male == 1:
        return 'Male'
    else:
        return 'Female'

In [325]:
final_df['male'] = final_df['male'].apply(dummy_to_gender)

### Count of user groups

In [319]:
fig = px.pie(final_df.groupby('group').count().reset_index(), names='group', values='profile', title='Distribution of Groups')
fig.show()

In [337]:
fig = px.pie(final_df[final_df['group'] == 0].groupby('male').count().reset_index(), names='male', values='profile', 
        title='Gender of Group 0')
fig.show()

In [349]:
px.bar(final_df[final_df['group'] == 0].drop(['profile', 'group', 'male'], axis=1).sum(), color_discrete_sequence=['#00CC96'])

In [339]:
px.pie(final_df[final_df['group'] == 1].groupby('male').count().reset_index(), names='male', values='profile',
        title='Gender of Group 1')

In [283]:
px.bar(final_df[final_df['group'] == 1].drop(['profile', 'group', 'male'], axis=1).sum())

In [351]:
px.pie(final_df[final_df['group'] == 2].groupby('male').count().reset_index(), names='male', values='profile',
        title='Gender of Group 2')

In [356]:
px.bar(final_df[final_df['group'] == 2].drop(['profile', 'group', 'male'], axis=1).sum(), color_discrete_sequence=['#EF553B'])

In [341]:
px.pie(final_df[final_df['group'] == 3].groupby('male').count().reset_index(), names='male', values='profile',
        title='Gender of Group 3')

In [360]:
px.bar(final_df[final_df['group'] == 3].drop(['profile', 'group', 'male'], axis=1).sum(), color_discrete_sequence=['#AB63FA'])


# Ending and saving data

In [364]:
final_df.to_csv('../data/final_data.csv')

In [365]:
pickle.dump(final_df, open('../data/final_data.pickle', 'wb'))