# Fetching and preparing data

In [260]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.tree import export_text

wanna_drop=['Romaji','Japanese','English Voice','Japanese Voice','Birth','Death','Manga Debut','Anime Debut','Web Novel Debut','Light Novel Debut','Members','Light Novel','Profession','Leader','Affiliations','Manga','Anime','Drama CD Voice','Web Novel','Pet']

df = pd.read_csv('all_characters_in_mushoku_tensei.csv').drop(wanna_drop,axis=1,inplace=False)

# character 'Nuckelgard' is repeated so dropped one of it 
df.drop(17, inplace=True)
df.drop(10,inplace=True)

y = df['Name']
X = df.drop('Name',axis=1,inplace=False)
X.head()

Unnamed: 0,Nickname,Epithet,Race,Gender,Age,Status,Relatives,Affiliation,Occupation,Rank,Hair Color,Family,Party,Eye Color,Height
0,Dragon King,Armoured Dragon King,Dragon Tribe,Male,400+,Alive,Crystal (Father)\nDora (Mother)\nLaplace (God ...,Asura Kingdom\nRudeus Greyrat\nNanahoshi Shizuka,Hero,Summoning God[1]\nBarrier God,,,,,
1,,,Immortal Demon Race (Hybrid),Male,Several Hundreds,Alive,,Atoferatofe Rybak,Guard Captain,,,,,,
2,,Sword Saint,Human,Male,,Alive,Jino Britts (Son),«Holy Land of Swords»\n«Sword God Style»,Swordsman,«Sword Emperor»,,,,,
3,,,Human,Male,K39X,Deceased,,«Asura Kingdom»,Guardian Knight,,,,,,
4,,,Beast Race,Male,,Alive,Gyes Dedoldia (Son)\nGhislaine Dedoldia (Daugh...,,Chief (Former),,,,,,


In [261]:
y.head()

0      Perugius Dola
1              Moore
2     Timothy Britts
3     Derrick Redbat
4    Gustav Dedoldia
Name: Name, dtype: object

# Values Below are meant for better understanding the data

In [262]:
df.shape

(137, 16)

Unique Values: Nickname 36,
Epithet 45,
Race 36,
Gender 3,
Age 36,
Status 16,
Relatives 74,
Affiliation 66,
Occupation 71,
Rank 40,
Light Novel Debut 34,
Web Novel Debut 28,
Anime Debut 28,
Japanese Voice 66,
English Voice 61,
Manga Debut 28,
Birth 38,
Death 16,
Hair Color 6,
Family 9,
Party 23,
Profession 2,
Leader 2,
Members 2,
Affiliations 2,
Light Novel 6,
Manga 6,
Anime 7,
Eye Color 4,
Web Novel 5,
Drama CD Voice 7,
Height 8,
Pet 3

NA Values:
Nickname 103,
Epithet 90,
Race 9,
Gender 3,
Age 97,
Status 16,
Relatives 65,
Affiliation 62,
Occupation 37,
Rank 81,
Light Novel Debut 50,
Web Novel Debut 62,
Anime Debut 75,
Japanese Voice 72,
English Voice 74,
Manga Debut 88,
Birth 71,
Death 96,
Hair Color 133,
Family 130,
Party 110,
Profession 137,
Leader 137,
Members 137,
Affiliations 137,
Light Novel 131,
Manga 132,
Anime 132,
Eye Color 134,
Web Novel 133,
Drama CD Voice 132,
Height 131,
Pet 136

Observational functions

In [263]:
def howManyInstancesOfSomeValue(X):
    for column in X:
        print(column)
        for value in X[column].unique():
            print("---------------------------------------")
            print(value,X[column].value_counts()[value])
        print('######################################################################')

In [264]:
X.fillna("Don't Know", inplace=True)
#howManyInstancesOfSomeValue(X)

Nickname
---------------------------------------
Dragon King 1
---------------------------------------
Don't Know 102
---------------------------------------
King's Greatshield 1
---------------------------------------
Head Ripping Prince
Master (by Julie) 1
---------------------------------------
Ray 1
---------------------------------------
Peacock Sword 1
---------------------------------------
Newbie (by Rudeus) 1
---------------------------------------
Sándor von Grandeur 1
---------------------------------------
Alek (by Atofe and Kishirika)
Left Hand of the Dragon God 1
---------------------------------------
Vio 1
---------------------------------------
Goddess (by Rudeus)[1]
Shishou (by Rudeus) 1
---------------------------------------
Rudy
The Owner Rujierd
The Owner of Dead End
Rudeus of the Quagmire
Shishou (by Zanoba)
Grand Master (by Julie)
Senpai (by Geese) 1
---------------------------------------
Rudi/Rudy[1]
The Owner Rujierd[2]
The Owner of Dead End
Kennel Master Rui

Tweaking the features Executing One Hot Encoding reason why not using libraries data is really not fit it needs some fine touch 

In [265]:
#One Hot Encoding 
#Nickname
X['have nickname']=(X['Nickname'] != "Don't Know")
X['have multiple nicknames'] = X['Nickname'].str.contains('\n')
X.drop('Nickname',axis=1,inplace=True)

#Epithet
X['is acknowledged by people (has some achievements)']=(X['Epithet'] != "Don't Know")
X.drop('Epithet',axis=1,inplace=True)

#Race
X['is a hybrid']=( (X['Race'].str.contains("Hybrid")) | (X['Race'].str.contains("Tribrid")) )
X['is a human']=(X['Race'].str.contains("Human"))
X['is a Beast']=(X['Race'].str.contains("Beast"))
X['is a Elf']=(X['Race'].str.contains("Elf"))
X['is a Demon']=(X['Race'].str.contains("Demon"))
X['is a Migurd']=(X['Race'].str.contains("Migur"))
X['is a Dwarf']=(X['Race'].str.contains("Dwarf"))
X['is a Superd']=(X['Race'].str.contains("Superd"))
X['is a Ogre']=(X['Race'].str.contains("Ogre"))
X['is a Hobbit']=(X['Race'].str.contains("Hobbit"))
X['is a Dragon']=(X['Race'].str.contains("Dragon"))
X['is a Horse']=(X['Race'].str.contains("Horse"))
X['is a Mildett']=(X['Race'].str.contains("Mildett"))
X['is a Spirit Familiar']=(X['Race'].str.contains("Spirit Familiar"))
X['is a Demigod']=(X['Race'].str.contains("Demi"))
X['is a Heaven Race']=(X['Race'].str.contains("Heaven Race"))
X.drop('Race',axis=1,inplace=True)

#Gender
X['is female']=(X['Gender']=='Female')
X['unknown or does not have gender']=(X['Gender']=="Don't Know")
X.drop('Gender',axis=1,inplace=True)

#Age
X['age is unknown']=( (X['Age'].str.contains('Unknown')) | (X['Age'].str.contains("Don't Know")) )
X['has lived more than 200 years']=( (X['Age'].str.contains('400+')) | ((X['Age'] == "+12000 years")) | (X['Age'].str.contains("566 (Volume 3)")) | (X['Age'].str.contains("32,890+")) | (X['Age'].str.contains("22000+")) | (X['Age'].str.contains("~4200 - 10000")) | (X['Age'].str.contains("7000")))
X['has lived more than 200 years']= (X['Age'].str.contains("Several Hundreds") | (X['Age'].str.contains("K39X")) | (X['Age'].str.contains("Immortal")) | (X['Age'].str.contains("200+")))
X.drop('Age',axis=1,inplace=True)

#Status
X['is Dead or sealed']=( X['Status'].str.contains("Alive") != True)
X.drop('Status',axis=1,inplace=True)

#Relatives
X['has no relatives or relatives are unknown'] =( (X['Relatives']=="Don't Know") & ( X['Family']=="Don't Know" ) )
X['is related to Greyrats and (Latreia from Zeniths side)']=( (X['Relatives'].str.contains("Latreia")) | (X['Relatives'].str.contains("Greyrats")) | (X['Family'].str.contains("Greyrats"))|(X['Family'].str.contains("Latreia")))
X['is related to Dedoldias (Beastman)']=((X['Relatives'].str.contains("Dedoldia"))|(X['Family'].str.contains("Dedoldia")))
X['is related to Zaonoba']=((X['Relatives'].str.contains("Shirone"))|(X['Family'].str.contains("Shirone")))
X.drop('Relatives',axis=1,inplace=True)

#Affiliation
X['is affiliated with Asura Kingdom']=(X['Affiliation'].str.contains("Asura Kingdom"))
X['is affiliated with Rudeus Greyrat']=(X['Affiliation'].str.contains("Rudeus Greyrat"))
X['is affiliated with Ranoa Magic Academy']=(X['Affiliation'].str.contains("Ranoa Magic Academy"))
X['is affiliated with Ariel Anemoi Asura']=(X['Affiliation'].str.contains("Ariel Anemoi Asura"))
X['is of Unknown affiliation']=(X['Affiliation']=="Don't Know")
X.drop('Affiliation',axis=1,inplace=True)

#Occupation
X['occupation is unknown']=(X['Occupation'].str.contains("Don't Know"))
X['has been a student']=(X['Occupation'].str.contains("Student"))
X['is a Mercenary']=(X['Occupation'].str.contains("Mercenary"))
X['is an Adventurer']=(X['Occupation'].str.contains("Adventurer"))
X['was or is a slave']=(X['Occupation'].str.contains("lave"))
X['is a Knight']=(X['Occupation'].str.contains("Knight"))
X['is a God']=(X['Occupation'].str.contains("God"))
X['is a Swordsman/Swordswoman']=( (X['Occupation'].str.contains("Swordsman")) | (X['Occupation'].str.contains("Swordswoman")) )
X.drop('Occupation',axis=1,inplace=True)

#Rank
X['Rank']=X['Rank'].str.replace("«Adventure Guild»","Adventure Guild")
X["'s rank is unknown ex:'B «Adventure Guild»'"]=(X['Rank'].str.contains("Don't Know"))
X["is A rank in Adventure Guild"]=( (X['Rank'].str.contains("A Adventure Guild")) | (X['Rank']=='A') )
X["is B rank in Adventure Guild"]=( (X['Rank'].str.contains("B Adventure Guild")) | (X['Rank']=='B'))
X["is C rank in Adventure Guild"]=( (X['Rank'].str.contains("C Adventure Guild")) | (X['Rank']=='C'))
X["is D rank in Adventure Guild"]=( (X['Rank'].str.contains("D Adventure Guild")) | (X['Rank']=='D'))
X["is S rank in Adventure Guild"]=( (X['Rank'].str.contains("S Adventure Guild")) | (X['Rank']=='S'))
X["is a 'god' level"]=(X['Rank'].str.contains("God"))
X["is an 'Emperor' level"]=(X['Rank'].str.contains("Emp"))
X.drop('Rank',axis=1,inplace=True)

#Party
X['is a member of some party'] = (X['Party'].str.contains("Don't Know") != True)
X.drop('Party',axis=1,inplace=True)
#Hair Color
X['s hair color is Blue']=(X['Hair Color']=='Blue')
X['s hair color is Blonde']=(X['Hair Color']=='Blonde')
X['s hair color is Light Brown']=(X['Hair Color']=='Light Brown')
X['s hair color is Silver']=(X['Hair Color']=='Silver')
X['s hair color is Red']=(X['Hair Color']=='Red')
X.drop('Hair Color',axis=1,inplace=True)

#Eye Color
#X['s eye color is Blue']=(X['Eye Color']=='Blue')
#X['s eye color is Green']=(X['Eye Color']=='Green')
#X['s eye color is Red']=(X['Eye Color']=='Red')
X.drop(['Eye Color','Height','Family'],axis=1,inplace=True)

#Just merging it for later visualization purposes
df=pd.merge(X,y,left_index=True, right_index=True)

df.head()
#howManyInstancesOfSomeValue(X)

  X['has lived more than 200 years']=( (X['Age'].str.contains('400+')) | ((X['Age'] == "+12000 years")) | (X['Age'].str.contains("566 (Volume 3)")) | (X['Age'].str.contains("32,890+")) | (X['Age'].str.contains("22000+")) | (X['Age'].str.contains("~4200 - 10000")) | (X['Age'].str.contains("7000")))


Unnamed: 0,have nickname,have multiple nicknames,is acknowledged by people (has some achievements),is a hybrid,is a human,is a Beast,is a Elf,is a Demon,is a Migurd,is a Dwarf,...,is S rank in Adventure Guild,is a 'god' level,is an 'Emperor' level,is a member of some party,s hair color is Blue,s hair color is Blonde,s hair color is Light Brown,s hair color is Silver,s hair color is Red,Name
0,True,False,True,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,Perugius Dola
1,False,False,False,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,Moore
2,False,False,True,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,Timothy Britts
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Derrick Redbat
4,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,Gustav Dedoldia


now that we have created tons of categorical features lets check them out what kind of attributes do they have

# Observing the data looking for ways to improve it and so on ...

Looking at corolations

In [266]:
#y=df['Name'].map(mapper)

## Pearson correlation
#pearson_corr = df.corr(method='pearson')
#plt.figure( figsize=(32,24) )
#sns.heatmap(pearson_corr, annot=True, cmap='coolwarm', fmt=".2f")

For Encoding question arises why did 

In [267]:
for column in X:
    X[column]=(X[column].astype(int)*2-1)

# It Creates a CSV for later use that file will constantly change depending on inputs if it guesses rigth new row will be added not sure if it will improve performance
pd.merge(X,y,left_index=True, right_index=True)#.to_csv('all_the_chars_base.csv', index=False)

X

Unnamed: 0,have nickname,have multiple nicknames,is acknowledged by people (has some achievements),is a hybrid,is a human,is a Beast,is a Elf,is a Demon,is a Migurd,is a Dwarf,...,is D rank in Adventure Guild,is S rank in Adventure Guild,is a 'god' level,is an 'Emperor' level,is a member of some party,s hair color is Blue,s hair color is Blonde,s hair color is Light Brown,s hair color is Silver,s hair color is Red
0,1,-1,1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,1,-1,-1,-1,-1,-1,-1,-1
1,-1,-1,-1,1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,-1,-1,1,-1,1,-1,-1,-1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,-1
3,-1,-1,-1,-1,1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,-1,-1,-1,-1,-1,1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,-1,-1,-1,-1,1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,-1,-1,-1,-1,-1
135,-1,-1,-1,1,1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
136,1,1,1,-1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,-1,1,-1,-1,-1,-1,-1
137,1,-1,-1,1,1,1,1,-1,-1,-1,...,-1,-1,1,1,-1,-1,-1,-1,-1,-1


# Model (from sckit)

In [268]:
df = pd.read_csv('all_the_chars_base.csv')
X = df.drop('Name',axis=1,inplace=False)
y = df['Name']
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier()

# Train the Decision Tree Classifier I wouldn't call it training just creating a tree  
clf.fit(X, y)

# Predict on the test set
predictions = clf.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

#treeVisuals(clf)


Accuracy: 0.9310344827586207


Tree Visualization

In [269]:
def treeVisuals(clf):
    feature_importance = clf.feature_importances_
    # Define the feature names based on your dataset
    print(feature_importance)
    plt.figure(figsize=(100, 90))
    plot_tree(clf, filled=True, feature_names=X.columns)
    plt.show()

Defining priority columns thanks to which questioning order will be defined

In [270]:
order = []

for column in X:
    count_1 = len(X[X[column] == 1])
    count_0 = len(X[X[column] == -1])
    
    columnCharacteristics = {
        'column' : column,
        'difference' : abs( count_0 - count_1 )
    }
    order.append(columnCharacteristics)
order = sorted(order, key=lambda x: x['difference'])

Questioning algorithm 

In [271]:
print('Hello welcome to Akinator style game only characters from Mishoku Tensei')
while True:
    if  input('Do you want to proceed with the game (yes/no)?').lower()== 'no':
        print('Thanks for playing :)')
        break
    
    charVector = pd.DataFrame(columns=X.columns)
    charVector.loc[0] = [0]*55
    
    questionNumber = 0
    howManyQuestionsShouldAskBeforeTryingToPredict = 10
    totalQuestionsAsked=0
    response_to_character_prediction='no'
    
    for question in order:
        questionNumber=questionNumber+1
        totalQuestionsAsked=totalQuestionsAsked+1       
        response = input("Your character "+question['column']+" yes/no/donno").lower()
        
        if "yes" in response:
            charVector[question['column']]=1
        elif "no" in response:
            charVector[question['column']]=-1
        else :
            charVector[question['column']]=0
        
        if ( howManyQuestionsShouldAskBeforeTryingToPredict==questionNumber ) or ( totalQuestionsAsked == 55 ) :
            prediction = clf.predict(charVector)
            response_to_character_prediction = input("is this your character ? (yes/no):"+prediction[0]).lower()
            
            if response_to_character_prediction=='yes':
                print('Alright that is it :)')
            else:
                questionNumber=0
        
        if (response_to_character_prediction =='yes'):
            charVector['Name']=prediction[0]
            df = pd.concat([df, charVector], ignore_index=True)
            df.to_csv('all_the_chars_base.csv', index=False)
            break
        if (totalQuestionsAsked == 55):
            break

Hello welcome to Akinator style game only characters from Mishoku Tensei


KeyboardInterrupt: Interrupted by user