In [55]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ast import literal_eval
import re
import nltk
from gensim.models import Word2Vec
from sklearn.preprocessing import MinMaxScaler
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [56]:
df_anime = pd.read_csv('Anime_data.csv', low_memory=False)

In [57]:
df_anime = df_anime[:10000]

In [58]:
df_anime.describe()

Unnamed: 0,Anime_id,Rating,ScoredBy,Popularity,Members,Episodes
count,10000.0,9986.0,9986.0,9926.0,10000.0,9933.0
mean,12546.5394,6.440038,13661.61,7333.624018,26486.52,13.173462
std,10582.190672,0.99047,48347.23,4315.251361,83135.0,49.416582
min,1.0,1.9,1.0,1.0,2.0,1.0
25%,2964.5,5.8025,96.0,3610.25,374.0,1.0
50%,9006.5,6.5,812.5,7279.0,2165.0,2.0
75%,22059.0,7.16,5719.5,10948.75,13187.0,12.0
max,31994.0,9.25,1006242.0,15637.0,1451708.0,1818.0


In [59]:
def missing_values(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(columns={
        0: 'Missing Values',
        1: '% of Total Values'
    })
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
            '% of Total Values', ascending=False).round(1)
    print("Dataframe has " + str(df.shape[1]) + " columns.")
    print("There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    
    return mis_val_table_ren_columns

In [60]:
miss_values = missing_values(df_anime)
miss_values

Dataframe has 15 columns.
There are 10 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Producer,3857,38.6
Studio,3710,37.1
Synopsis,339,3.4
Popularity,74,0.7
Aired,74,0.7
Link,74,0.7
Episodes,67,0.7
Genre,29,0.3
Rating,14,0.1
ScoredBy,14,0.1


In [61]:
#drop missing value where rating is missing

df_anime = df_anime.dropna(subset=["Rating"])

In [62]:
#fill missing values using the mean

df_anime['ScoredBy'] = df_anime['ScoredBy'].fillna(df_anime['ScoredBy'].median())
df_anime['Popularity'] = df_anime['Popularity'].fillna(df_anime['Popularity'].median())
df_anime['Members'] = df_anime['Members'].fillna(df_anime['Members'].median())

In [63]:
#drop useless column

df_anime = df_anime.drop('Aired', axis=1)
df_anime = df_anime.drop('Anime_id', axis=1)
df_anime = df_anime.drop('Link', axis=1)
df_anime = df_anime.drop('Source', axis=1)
df_anime = df_anime.drop('Episodes', axis=1)
df_anime = df_anime.drop('Popularity', axis=1)
df_anime = df_anime.drop('ScoredBy', axis=1)
df_anime = df_anime.drop('Members', axis=1)

In [64]:
df_anime

Unnamed: 0,Title,Genre,Synopsis,Type,Producer,Studio,Rating
0,Cowboy Bebop,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","In the year 2071, humanity has colonized sever...",TV,['Bandai Visual'],['Sunrise'],8.81
1,Cowboy Bebop: Tengoku no Tobira,"['Action', 'Space', 'Drama', 'Mystery', 'Sci-Fi']","Another day, another bounty—such is the life o...",Movie,"['Sunrise', 'Bandai Visual']",['Bones'],8.41
2,Trigun,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...","Vash the Stampede is the man with a $$60,000,0...",TV,['Victor Entertainment'],['Madhouse'],8.31
3,Witch Hunter Robin,"['Action', 'Magic', 'Police', 'Supernatural', ...",Witches are individuals with special powers li...,TV,['Bandai Visual'],['Sunrise'],7.34
4,Bouken Ou Beet,"['Adventure', 'Fantasy', 'Shounen', 'Supernatu...",It is the dark century and the people are suff...,TV,,['Toei Animation'],7.04
...,...,...,...,...,...,...,...
9995,Okusama ga Seitokaichou!: Seitokaichou to Ofur...,"['Comedy', 'Romance', 'Ecchi', 'Shounen']",Bundled with the limited edition ninth manga v...,OVA,,['Seven'],6.89
9996,Lupin Shanshei Pilot,"['Parody', 'Comedy']",,Special,,['TMS Entertainment'],5.69
9997,Hibike! Euphonium 2,"['Music', 'School', 'Drama']",Following their success in the qualifying roun...,TV,"['Lantis', 'Pony Canyon', 'Rakuonsha']",['Kyoto Animation'],8.29
9998,Hibike! Euphonium Movie 1: Kitauji Koukou Suis...,"['Music', 'School']",After swearing off music due to an incident at...,Movie,"['Lantis', 'Shochiku']",['Kyoto Animation'],7.45


In [65]:
miss_values = missing_values(df_anime)
miss_values

Dataframe has 7 columns.
There are 4 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Producer,3846,38.5
Studio,3704,37.1
Synopsis,338,3.4
Genre,29,0.3


In [66]:
df_anime['Synopsis'] = df_anime['Synopsis'].fillna("unknown")

In [67]:
df_anime

Unnamed: 0,Title,Genre,Synopsis,Type,Producer,Studio,Rating
0,Cowboy Bebop,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","In the year 2071, humanity has colonized sever...",TV,['Bandai Visual'],['Sunrise'],8.81
1,Cowboy Bebop: Tengoku no Tobira,"['Action', 'Space', 'Drama', 'Mystery', 'Sci-Fi']","Another day, another bounty—such is the life o...",Movie,"['Sunrise', 'Bandai Visual']",['Bones'],8.41
2,Trigun,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...","Vash the Stampede is the man with a $$60,000,0...",TV,['Victor Entertainment'],['Madhouse'],8.31
3,Witch Hunter Robin,"['Action', 'Magic', 'Police', 'Supernatural', ...",Witches are individuals with special powers li...,TV,['Bandai Visual'],['Sunrise'],7.34
4,Bouken Ou Beet,"['Adventure', 'Fantasy', 'Shounen', 'Supernatu...",It is the dark century and the people are suff...,TV,,['Toei Animation'],7.04
...,...,...,...,...,...,...,...
9995,Okusama ga Seitokaichou!: Seitokaichou to Ofur...,"['Comedy', 'Romance', 'Ecchi', 'Shounen']",Bundled with the limited edition ninth manga v...,OVA,,['Seven'],6.89
9996,Lupin Shanshei Pilot,"['Parody', 'Comedy']",unknown,Special,,['TMS Entertainment'],5.69
9997,Hibike! Euphonium 2,"['Music', 'School', 'Drama']",Following their success in the qualifying roun...,TV,"['Lantis', 'Pony Canyon', 'Rakuonsha']",['Kyoto Animation'],8.29
9998,Hibike! Euphonium Movie 1: Kitauji Koukou Suis...,"['Music', 'School']",After swearing off music due to an incident at...,Movie,"['Lantis', 'Shochiku']",['Kyoto Animation'],7.45


# Column "Genre"

In [68]:
df_anime['Genre'] = df_anime['Genre'].fillna("['unknown']")

df_anime['Genre'] = df_anime['Genre'].str.replace('[', '')
df_anime['Genre'] = df_anime['Genre'].str.replace(']', '')
df_anime['Genre'] = df_anime['Genre'].str.replace("'", '')

  df_anime['Genre'] = df_anime['Genre'].str.replace('[', '')
  df_anime['Genre'] = df_anime['Genre'].str.replace(']', '')


In [69]:
dummies = df_anime["Genre"].str.get_dummies(', ').add_prefix('genre_')
df_anime = pd.concat([df_anime, dummies], axis=1)

In [70]:
df_anime = df_anime.drop('Genre', axis=1)

In [71]:
df_anime

Unnamed: 0,Title,Synopsis,Type,Producer,Studio,Rating,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,...,genre_Slice of Life,genre_Space,genre_Sports,genre_Super Power,genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi,genre_Yuri,genre_unknown
0,Cowboy Bebop,"In the year 2071, humanity has colonized sever...",TV,['Bandai Visual'],['Sunrise'],8.81,1,1,0,1,...,0,1,0,0,0,0,0,0,0,0
1,Cowboy Bebop: Tengoku no Tobira,"Another day, another bounty—such is the life o...",Movie,"['Sunrise', 'Bandai Visual']",['Bones'],8.41,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Trigun,"Vash the Stampede is the man with a $$60,000,0...",TV,['Victor Entertainment'],['Madhouse'],8.31,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,Witch Hunter Robin,Witches are individuals with special powers li...,TV,['Bandai Visual'],['Sunrise'],7.34,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,Bouken Ou Beet,It is the dark century and the people are suff...,TV,,['Toei Animation'],7.04,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Okusama ga Seitokaichou!: Seitokaichou to Ofur...,Bundled with the limited edition ninth manga v...,OVA,,['Seven'],6.89,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9996,Lupin Shanshei Pilot,unknown,Special,,['TMS Entertainment'],5.69,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9997,Hibike! Euphonium 2,Following their success in the qualifying roun...,TV,"['Lantis', 'Pony Canyon', 'Rakuonsha']",['Kyoto Animation'],8.29,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,Hibike! Euphonium Movie 1: Kitauji Koukou Suis...,After swearing off music due to an incident at...,Movie,"['Lantis', 'Shochiku']",['Kyoto Animation'],7.45,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Column Producer

In [72]:
df_anime['Producer'] = df_anime['Producer'].fillna("['unknown']")
df_anime['Producer'] = df_anime['Producer'].str.replace('[', '')
df_anime['Producer'] = df_anime['Producer'].str.replace(']', '')
df_anime['Producer'] = df_anime['Producer'].str.replace("'", '')

  df_anime['Producer'] = df_anime['Producer'].str.replace('[', '')
  df_anime['Producer'] = df_anime['Producer'].str.replace(']', '')


In [73]:
dummies = df_anime["Producer"].str.get_dummies(', ').add_prefix('producer_')
df_anime = pd.concat([df_anime, dummies], axis=1)

In [74]:
df_anime = df_anime.drop('Producer', axis=1)

In [75]:
df_anime

Unnamed: 0,Title,Synopsis,Type,Studio,Rating,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,genre_Dementia,...,producer_feng,producer_flying DOG,producer_gimik,producer_i0+,producer_iQIYI,producer_indigo line,producer_m.o.e.,producer_teamKG,producer_tsuritama partners,producer_unknown
0,Cowboy Bebop,"In the year 2071, humanity has colonized sever...",TV,['Sunrise'],8.81,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Cowboy Bebop: Tengoku no Tobira,"Another day, another bounty—such is the life o...",Movie,['Bones'],8.41,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Trigun,"Vash the Stampede is the man with a $$60,000,0...",TV,['Madhouse'],8.31,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Witch Hunter Robin,Witches are individuals with special powers li...,TV,['Sunrise'],7.34,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Bouken Ou Beet,It is the dark century and the people are suff...,TV,['Toei Animation'],7.04,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Okusama ga Seitokaichou!: Seitokaichou to Ofur...,Bundled with the limited edition ninth manga v...,OVA,['Seven'],6.89,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
9996,Lupin Shanshei Pilot,unknown,Special,['TMS Entertainment'],5.69,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
9997,Hibike! Euphonium 2,Following their success in the qualifying roun...,TV,['Kyoto Animation'],8.29,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,Hibike! Euphonium Movie 1: Kitauji Koukou Suis...,After swearing off music due to an incident at...,Movie,['Kyoto Animation'],7.45,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Column Studio

In [76]:
df_anime['Studio'] = df_anime['Studio'].fillna("['unknown']")
df_anime['Studio'] = df_anime['Studio'].str.replace('[', '')
df_anime['Studio'] = df_anime['Studio'].str.replace(']', '')
df_anime['Studio'] = df_anime['Studio'].str.replace("'", '')

  df_anime['Studio'] = df_anime['Studio'].str.replace('[', '')
  df_anime['Studio'] = df_anime['Studio'].str.replace(']', '')


In [77]:
dummies = df_anime["Studio"].str.get_dummies(', ').add_prefix('studio_')
df_anime = pd.concat([df_anime, dummies], axis=1)
df_anime = df_anime.drop('Studio', axis=1)

# Column Type

In [78]:
dummies = df_anime["Type"].str.get_dummies(', ').add_prefix('type_')
df_anime = pd.concat([df_anime, dummies], axis=1)
df_anime = df_anime.drop('Type', axis=1)

# Column Synopsis

In [79]:
#df_synospsis['Synopsis'] = df_synospsis['Synopsis'].apply(preprocess_text)

In [80]:
df_temp = df_anime[['Title', 'Synopsis']]

In [81]:
rem_nombre = r'\d+'
df_temp["Synopsis"] = df_temp["Synopsis"].apply(lambda x: re.sub(r'\s*(\r\n\s*)+|\[Written by .*?\]+|\(Source: .*?\)\s*', '', x))
df_temp["Synopsis"] = df_temp["Synopsis"].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
df_temp = df_temp.apply(lambda x : x.astype(str).str.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp["Synopsis"] = df_temp["Synopsis"].apply(lambda x: re.sub(r'\s*(\r\n\s*)+|\[Written by .*?\]+|\(Source: .*?\)\s*', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp["Synopsis"] = df_temp["Synopsis"].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))


In [82]:
df_temp["Synopsis"] = df_temp["Synopsis"].apply(lambda x : word_tokenize(x))

In [83]:
df_temp["Synopsis"] = df_temp["Synopsis"].apply(lambda words : [word for word in words if word not in nltk.corpus.stopwords.words('english')])

In [84]:
model_wc = Word2Vec(df_temp["Synopsis"], vector_size=100, window=5, min_count=1, workers=4)

In [85]:
def get_vector(text):
    tokens = [token for token in text]
    vectors = [model_wc.wv[token] for token in tokens if token in model_wc.wv]
    return sum(vectors)/len(vectors)

In [86]:
df_temp["Synopsis"] = df_temp["Synopsis"].apply(get_vector)
df_anime["Synopsis"] = df_temp["Synopsis"]

In [87]:
df_anime['Synopsis'] = df_anime['Synopsis'].apply(lambda x : x.tolist())

# Column Title

In [88]:
df_temp["Title"] = df_temp["Title"].apply(lambda x: re.sub(r'\s*(\r\n\s*)+|\[Written by .*?\]+|\(Source: .*?\)\s*', '', x))
df_temp["Title"] = df_temp["Title"].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

In [89]:
df_temp["Title"] = df_temp["Title"].apply(lambda x : word_tokenize(x))

In [90]:
df_temp["Title"] = df_temp["Title"].apply(lambda words : [word for word in words if word not in nltk.corpus.stopwords.words('english')])

In [91]:
model_wc = Word2Vec(df_temp['Title'], vector_size=100, window=5, min_count=1, workers=4)

In [92]:
df_anime["Synopsis"].shape

(9986,)

In [93]:
def get_title_vector(title):
    words = title.lower().split()
    vectors = [model_wc.wv[word] for word in words if word in model_wc.wv.key_to_index]
    if len(vectors) > 0:
        return sum(vectors) / len(vectors)
    else:
        return np.zeros(model_wc.vector_size)

In [94]:
df_anime["Title"] = df_anime["Title"].apply(get_title_vector)

In [95]:
scaler = MinMaxScaler()

In [96]:
df_anime["Title"]

0       [-0.0021535645, 0.005331165, 0.0034140975, 0.0...
1       [-0.0034563055, 0.0027027002, 0.0041556074, 0....
2       [-0.0061542904, -3.8414695e-05, 0.0069717416, ...
3       [0.007515982, 0.001826553, 0.0019359732, -0.00...
4       [0.0008875814, 0.0029268626, -0.0029157682, -0...
                              ...                        
9995    [-0.0027930741, 0.001632675, -0.0022314587, -0...
9996    [-0.0027594443, -0.008280367, -0.0040012714, 0...
9997    [-0.011100721, 0.008747485, 0.004837468, -0.00...
9998    [0.00026614065, 0.00836071, 0.00575299, 0.0008...
9999    [-0.0023936694, 0.00013821565, 0.0020302765, -...
Name: Title, Length: 9986, dtype: object

In [97]:
df_anime["Title"] = scaler.fit_transform(df_anime["Title"].tolist())

In [98]:
print(df_anime.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9986 entries, 0 to 9999
Columns: 1332 entries, Title to type_TV
dtypes: float64(2), int64(1329), object(1)
memory usage: 101.6+ MB
None


In [99]:
df_anime

Unnamed: 0,Title,Synopsis,Rating,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,genre_Dementia,genre_Demons,genre_Drama,...,studio_ixtl,studio_teamKG,studio_ufotable,studio_unknown,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,0.513510,"[-0.22871875762939453, 0.4912940263748169, 0.0...",8.81,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,0.461968,"[-0.1771565079689026, 0.4537254571914673, 0.03...",8.41,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0.355224,"[-0.17196500301361084, 0.40078845620155334, 0....",8.31,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,0.896079,"[-0.26507359743118286, 0.5655261278152466, 0.0...",7.34,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0.633831,"[-0.19959399104118347, 0.5927262902259827, 0.0...",7.04,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.488208,"[-0.6608385443687439, 0.7791542410850525, 0.45...",6.89,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9996,0.489538,"[-0.35587775707244873, 0.7945708632469177, 0.1...",5.69,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9997,0.159521,"[-0.2327313870191574, 0.500264585018158, 0.005...",8.29,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
9998,0.609244,"[-0.248906672000885, 0.521472692489624, -0.014...",7.45,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [100]:
df_anime = df_anime.drop('Synopsis', axis=1)

# Output

In [109]:
df_anime.to_csv('output_data_anime.csv')

In [110]:
df_anime

Unnamed: 0,Title,Rating,genre_Action,genre_Adventure,genre_Cars,genre_Comedy,genre_Dementia,genre_Demons,genre_Drama,genre_Ecchi,...,studio_ixtl,studio_teamKG,studio_ufotable,studio_unknown,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,0.513510,8.81,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0.461968,8.41,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0.355224,8.31,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0.896079,7.34,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,0.633831,7.04,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.488208,6.89,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
9996,0.489538,5.69,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9997,0.159521,8.29,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
9998,0.609244,7.45,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [102]:
#nltk.download("stopwords")

#stopwords = set(stopwords.words('english'))
#words = [word for word in df_anime["Synopsis"] if not word in stopwords]
#df_anime["Synopsis"] = df_anime["Synopsis"].apply(lambda x: ' '.join([word for word in x.translate(str.maketrans('', '', string.punctuation)).lower().split() if word not in stopwords]))

In [103]:
#nltk.download('punkt')

#df_anime["Synopsis"] = df_anime["Synopsis"].astype("str")
#df_anime["Synopsis"] = df_anime["Synopsis"].apply(lambda x : word_tokenize(x))

In [104]:
#df_anime["Synopsis"] = df_anime["Synopsis"].apply(lambda x: ' '.join(x))

#vectorizer = CountVectorizer(stop_words='english')
#vectors = vectorizer.fit_transform(df_anime["Synopsis"])

In [105]:
#vocab = vectorizer.vocabulary_
#counts = vectors.sum(axis=0)

# Créer un DataFrame à partir du vocabulaire et des comptages
#df_vocab = pd.DataFrame({'mot': list(vocab.keys()), 'compte': counts.tolist()[0]})


In [106]:
#df_vocab = df_vocab.sort_values('compte', ascending=False).reset_index(drop=True)

In [107]:
#df_vocab