In [221]:
import pandas as pd
import numpy as np

I have five .csv files with words and its targets (complexity level) that I need to merge together in one file. But here is the catch: 
1. 2nd column (after the 'words' column) was generated by the model that was made by other person, and the 3rd column is fixes made by philologists. 
    * 'u' is one level higher
    * 'd' is one level lower
    * '0' is correct prediction
    * '*level*' is change of the level when it is different by more than one level
    I need to keep only assignments of philologists, as they have an authority in grading a complexity of Kazakh words (since Kazakh language is a resource language). So, I made an algorithm that will convert above values to an actual levels.
2. files don't have a columns and, hence, when converting to DataFrame format, first row is converted to be a columns' names.
3. same words might appear in several files while having some no equlities in levels and, hence, should merged while keeping a mind a hierarchical priority of philologists (e.g. Eldana has higher priority than Amirlan and it is more likely that her assignment of levels are correct)


In [222]:
df1 = pd.read_csv('data/levels_eldana_new.csv')

In [223]:
df1

Unnamed: 0,буыну,B1,0
0,прагматикалық,B1,B2
1,қарастырылу,B1,0
2,археологиялық,B2,0
3,думан,B2,d
4,қошеметтеу,A2,u
...,...,...,...
894,ұйымдық,B1,
895,тебірену,B2,
896,минерал,B2,
897,отырысу,B1,


In [224]:
first_row = pd.DataFrame([df1.columns], columns=df1.columns)
df1 = pd.concat([first_row, df1], ignore_index=True)
df1.columns = np.arange(0, df1.shape[1])

In [225]:
df1

Unnamed: 0,0,1,2
0,буыну,B1,0
1,прагматикалық,B1,B2
2,қарастырылу,B1,0
3,археологиялық,B2,0
4,думан,B2,d
...,...,...,...
895,ұйымдық,B1,
896,тебірену,B2,
897,минерал,B2,
898,отырысу,B1,


In [226]:
df2 = pd.read_csv('data/levels_eldana_c.csv')
first_row = pd.DataFrame([df2.columns], columns=df2.columns)
df2 = pd.concat([first_row, df2], ignore_index=True)
df2.columns = np.arange(0, df2.shape[1])
df2

Unnamed: 0,0,1,2
0,оймақ,C1,0
1,олқылық,C1,0
2,параллелизм,C1,0
3,резюме,C1,0
4,рецензия,C1,0
5,рулы,C1,0
6,соматикалық,C1,0
7,сұраным,B2,0
8,төлнұсқа,B2,0
9,тіфә,C1,0


In [227]:
df = pd.concat([df1, df2], ignore_index=True)

**df** will be main dataset

In [228]:
df

Unnamed: 0,0,1,2
0,буыну,B1,0
1,прагматикалық,B1,B2
2,қарастырылу,B1,0
3,археологиялық,B2,0
4,думан,B2,d
...,...,...,...
915,феодал,C1,0
916,сырбаз,C1,0
917,сұқсыр,C1,0
918,спектрограмма,C1,0


In [229]:
df[0].duplicated().sum()

0

In [230]:
df3 = pd.read_csv('data/levels_eldana.csv')

In [231]:
first_row = pd.DataFrame([df3.columns], columns=df3.columns)
df3 = pd.concat([first_row, df3], ignore_index=True)
df3.columns = np.arange(0, df3.shape[1])
df3

Unnamed: 0,0,1,2
0,раушан,B1,0
1,тер,B1,d
2,терлі,B2,A2
3,мүйіз,B1,0
4,тұяқ,A2,0
...,...,...,...
295,бақа,A2,0
296,құрбақа,A2,0
297,жарату,A1,B1
298,тұтыну,B1,0


In [232]:
df = pd.concat([df, df3], ignore_index=True)
df

Unnamed: 0,0,1,2
0,буыну,B1,0
1,прагматикалық,B1,B2
2,қарастырылу,B1,0
3,археологиялық,B2,0
4,думан,B2,d
...,...,...,...
1215,бақа,A2,0
1216,құрбақа,A2,0
1217,жарату,A1,B1
1218,тұтыну,B1,0


In [233]:
df.isna().sum()

0      1
1      1
2    491
dtype: int64

In [234]:
df.dropna(inplace=True)

In [235]:
df.isna().sum()

0    0
1    0
2    0
dtype: int64

In [236]:
df[0].duplicated().sum()

9

In [237]:
df.drop(df[df[0].duplicated()].index, inplace=True)

In [238]:
df[0].duplicated().sum()

0

In [239]:
df4 = pd.read_csv('data/levels_eldana_a.csv')

In [240]:
first_row = pd.DataFrame([df4.columns], columns=df4.columns)
df4 = pd.concat([first_row, df4], ignore_index= True)
df4.columns = np.arange(0, df4.shape[1])

In [241]:
df = pd.concat([df, df4], ignore_index=True)
df

Unnamed: 0,0,1,2
0,буыну,B1,0
1,прагматикалық,B1,B2
2,қарастырылу,B1,0
3,археологиялық,B2,0
4,думан,B2,d
...,...,...,...
897,жол,A1,0
898,салу,A1,0
899,беру,A1,0
900,сол,A1,0


In [242]:
df.isna().sum()

0    0
1    0
2    0
dtype: int64

In [243]:
df[0].duplicated().sum()

1

In [244]:
df.drop(df[df[0].duplicated()].index, inplace=True)

In [245]:
df[0].duplicated().sum()

0

In [246]:
df5 = pd.read_csv('data/levels_amirlan.csv')

In [247]:
first_row = pd.DataFrame([df5.columns], columns=df5.columns)
df5 = pd.concat([first_row, df5], ignore_index=True)
df5.columns = np.arange(0, df5.shape[1])

In [248]:
df = pd.concat([df, df5], ignore_index=True)

In [249]:
df.isna().sum()

0    0
1    0
2    0
dtype: int64

In [250]:
df[0].duplicated().sum()

350

I was told that files that has name "Eldana" has a highest priority, thus I will drop "last"  duplicates (since I concat to the botton of df)

In [251]:
df.drop(df[df[0].duplicated(keep='first')].index, inplace=True)

In [252]:
df[0].duplicated().sum()

0

In [253]:
df.columns = ['word', 'score_of_prev_model', 'fix']

In [254]:
df

Unnamed: 0,word,score_of_prev_model,fix
0,буыну,B1,0
1,прагматикалық,B1,B2
2,қарастырылу,B1,0
3,археологиялық,B2,0
4,думан,B2,d
...,...,...,...
1521,алды (алд),A2,0
1522,алдында,A1,0
1523,үміт,A2,0
1526,құндылық,A1,0


In [255]:
df.isna().sum()

word                   0
score_of_prev_model    0
fix                    0
dtype: int64

In [256]:
df.drop(df.loc[(df.fix == 'я даже не знаю что это такое')].index, inplace=True)

In [257]:
df.drop(df.loc[(df.fix == 'по идеи нет')].index, inplace=True)

In [258]:
df.loc[(df.fix == 'A1-A2')]

Unnamed: 0,word,score_of_prev_model,fix
1486,астана,B2,A1-A2


In [259]:
df.replace('A1-A2', 'A1', inplace=True)

In [260]:
df.loc[(df.fix == 'A1-A2')]

Unnamed: 0,word,score_of_prev_model,fix


**dfft** is a tabel of features of words, which show counts of occurences of them in different types of corpuses (e.g. literature, newspaper)

In [261]:
dfft = pd.read_csv('data/dictionary.csv')
dfft

Unnamed: 0,1,5167,бол/ет,28764,4604,9506,9549,7830,60253,47880,108133
0,2,22195,ол/ес,14143,5271,1711,7039,4358,32522,30274,62796
1,3,20374,мен/шл,18249,4500,4650,4450,3146,34995,23411,58406
2,4,7612,де/ет,16392,329,5973,4498,2480,29672,16528,46200
3,5,1046,ал/ет,13294,3507,4382,2726,4231,28140,15219,43359
4,6,4804,бер/ет,9156,2679,3761,5808,2569,23973,16322,40295
...,...,...,...,...,...,...,...,...,...,...,...
36239,-,-,қайдан/еc,-,-,-,-,-,-,-,0
36240,-,-,бассейн/зт,-,-,-,-,-,-,-,0
36241,-,-,Азия/зт,-,-,-,-,-,-,-,0
36242,-,-,мульфильм/зт,-,-,-,-,-,-,-,0


again, same issues with columns

In [262]:
first_row = pd.DataFrame([dfft.columns], columns = dfft.columns)
dfft = pd.concat([first_row, dfft], ignore_index=True)
dfft.columns = np.arange(0, dfft.shape[1])

In [263]:
dfft.drop([0, 1], axis=1, inplace=True)

In [264]:
dfft

Unnamed: 0,2,3,4,5,6,7,8,9,10
0,бол/ет,28764,4604,9506,9549,7830,60253,47880,108133
1,ол/ес,14143,5271,1711,7039,4358,32522,30274,62796
2,мен/шл,18249,4500,4650,4450,3146,34995,23411,58406
3,де/ет,16392,329,5973,4498,2480,29672,16528,46200
4,ал/ет,13294,3507,4382,2726,4231,28140,15219,43359
...,...,...,...,...,...,...,...,...,...
36240,қайдан/еc,-,-,-,-,-,-,-,0
36241,бассейн/зт,-,-,-,-,-,-,-,0
36242,Азия/зт,-,-,-,-,-,-,-,0
36243,мульфильм/зт,-,-,-,-,-,-,-,0


In [265]:
words = []
types_of_words = []
for i, row, in dfft.iterrows():
    words.append(row[2].split('/')[0])
    types_of_words.append(row[2].split('/')[1])
dfft['word'] = words
dfft['type_of_word'] = types_of_words

In [266]:
dfft.drop([2], axis=1, inplace=True)

In [267]:
dfft

Unnamed: 0,3,4,5,6,7,8,9,10,word,type_of_word
0,28764,4604,9506,9549,7830,60253,47880,108133,бол,ет
1,14143,5271,1711,7039,4358,32522,30274,62796,ол,ес
2,18249,4500,4650,4450,3146,34995,23411,58406,мен,шл
3,16392,329,5973,4498,2480,29672,16528,46200,де,ет
4,13294,3507,4382,2726,4231,28140,15219,43359,ал,ет
...,...,...,...,...,...,...,...,...,...,...
36240,-,-,-,-,-,-,-,0,қайдан,еc
36241,-,-,-,-,-,-,-,0,бассейн,зт
36242,-,-,-,-,-,-,-,0,Азия,зт
36243,-,-,-,-,-,-,-,0,мульфильм,зт


In [268]:
dfft = dfft.loc[:, ['word', 'type_of_word', 3, 4, 5, 6, 7, 8, 9, 10]]

In [269]:
dfft.drop(8, axis=1, inplace=True)

In [270]:
dfft.drop(10, axis=1, inplace=True)

In [271]:
dfft

Unnamed: 0,word,type_of_word,3,4,5,6,7,9
0,бол,ет,28764,4604,9506,9549,7830,47880
1,ол,ес,14143,5271,1711,7039,4358,30274
2,мен,шл,18249,4500,4650,4450,3146,23411
3,де,ет,16392,329,5973,4498,2480,16528
4,ал,ет,13294,3507,4382,2726,4231,15219
...,...,...,...,...,...,...,...,...
36240,қайдан,еc,-,-,-,-,-,-
36241,бассейн,зт,-,-,-,-,-,-
36242,Азия,зт,-,-,-,-,-,-
36243,мульфильм,зт,-,-,-,-,-,-


In [272]:

dfft.columns = ['word', 'type_of_word', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6']
dfft

Unnamed: 0,word,type_of_word,f1,f2,f3,f4,f5,f6
0,бол,ет,28764,4604,9506,9549,7830,47880
1,ол,ес,14143,5271,1711,7039,4358,30274
2,мен,шл,18249,4500,4650,4450,3146,23411
3,де,ет,16392,329,5973,4498,2480,16528
4,ал,ет,13294,3507,4382,2726,4231,15219
...,...,...,...,...,...,...,...,...
36240,қайдан,еc,-,-,-,-,-,-
36241,бассейн,зт,-,-,-,-,-,-
36242,Азия,зт,-,-,-,-,-,-
36243,мульфильм,зт,-,-,-,-,-,-


In [273]:
dfft.replace('-', 0, inplace=True)

In [274]:
for i, row in dfft.iterrows():
    if (int(row.f1)+int(row.f2)+int(row.f3)+int(row.f4)+int(row.f5)+int(row.f6)) < 2:
        dfft.drop(i, inplace=True)

I am dropping rare and strange samples

In [275]:
df

Unnamed: 0,word,score_of_prev_model,fix
0,буыну,B1,0
1,прагматикалық,B1,B2
2,қарастырылу,B1,0
3,археологиялық,B2,0
4,думан,B2,d
...,...,...,...
1521,алды (алд),A2,0
1522,алдында,A1,0
1523,үміт,A2,0
1526,құндылық,A1,0


In [276]:
dfft

Unnamed: 0,word,type_of_word,f1,f2,f3,f4,f5,f6
0,бол,ет,28764,4604,9506,9549,7830,47880
1,ол,ес,14143,5271,1711,7039,4358,30274
2,мен,шл,18249,4500,4650,4450,3146,23411
3,де,ет,16392,329,5973,4498,2480,16528
4,ал,ет,13294,3507,4382,2726,4231,15219
...,...,...,...,...,...,...,...,...
28031,этносаясаттанушы,зт,0,0,0,0,0,2
28032,этнотілдік,сн,0,0,0,0,0,2
28033,этноұжымдық,сн,0,0,0,0,0,2
28034,этноұлттық,сн,1,0,0,0,0,1


In [277]:
fdf = pd.merge(df, dfft)

In [278]:
fdf

Unnamed: 0,word,score_of_prev_model,fix,type_of_word,f1,f2,f3,f4,f5,f6
0,прагматикалық,B1,B2,сн,0,0,0,8,0,116
1,археологиялық,B2,0,сн,103,10,0,3,0,28
2,думан,B2,d,зт,100,0,6,10,7,14
3,графика,B2,0,зт,89,1,0,13,1,54
4,небәрі,B2,d,үс,27,1,5,21,7,79
...,...,...,...,...,...,...,...,...,...,...
1038,жұмыскер,C1,A2,зт,11,10,10,4,2,27
1039,алды (алд),A2,0,зт,175,7,337,18,29,0
1040,үміт,A2,0,зт,365,9,96,100,76,533
1041,құндылық,A1,0,зт,209,134,0,150,0,589


now, when I merged targets with features of words, I need to encode scores

In [279]:
from sklearn.preprocessing import OrdinalEncoder

categories = [['A1', 'A2', 'B1', 'B2', 'C1']]
encoder = OrdinalEncoder(categories=categories)
fdf["score_encoded"] = encoder.fit_transform(fdf[["score_of_prev_model"]])

In [280]:
fdf

Unnamed: 0,word,score_of_prev_model,fix,type_of_word,f1,f2,f3,f4,f5,f6,score_encoded
0,прагматикалық,B1,B2,сн,0,0,0,8,0,116,2.0
1,археологиялық,B2,0,сн,103,10,0,3,0,28,3.0
2,думан,B2,d,зт,100,0,6,10,7,14,3.0
3,графика,B2,0,зт,89,1,0,13,1,54,3.0
4,небәрі,B2,d,үс,27,1,5,21,7,79,3.0
...,...,...,...,...,...,...,...,...,...,...,...
1038,жұмыскер,C1,A2,зт,11,10,10,4,2,27,4.0
1039,алды (алд),A2,0,зт,175,7,337,18,29,0,1.0
1040,үміт,A2,0,зт,365,9,96,100,76,533,1.0
1041,құндылық,A1,0,зт,209,134,0,150,0,589,0.0


In [281]:
fdf.score_of_prev_model.value_counts()

score_of_prev_model
B1    282
A1    279
B2    261
A2    168
C1     53
Name: count, dtype: int64

Here is an algorithm to convert *fix* column to an actual target values of words

In [282]:
for i, row in fdf.iterrows():
    if (fdf.fix[i] == 'u') or (fdf.fix[i] == 'u/\\?'):
        fdf.loc[i, "score_encoded"] += 1
    elif (fdf.fix[i] == 'd') or (fdf.fix[i] == 'd?') :
        fdf.loc[i, "score_encoded"] -= 1


In [283]:
for i, row in fdf.iterrows():
    if fdf.fix[i] in categories[0]:
        
        fdf.loc[i, "score_encoded"] = encoder.transform([[fdf.fix[i]]])[0, 0]
        print(f"Row {i}: Encoded '{fdf.fix[i]}' as {fdf.loc[i, 'score_encoded']}")
    else:
        print(f"Row {i}: Skipped value '{fdf.fix[i]}' as it's not in categories")

Row 0: Encoded 'B2' as 3.0
Row 1: Skipped value '0' as it's not in categories
Row 2: Skipped value 'd' as it's not in categories
Row 3: Skipped value '0' as it's not in categories
Row 4: Skipped value 'd' as it's not in categories
Row 5: Skipped value '0' as it's not in categories
Row 6: Skipped value 'u' as it's not in categories
Row 7: Skipped value '0' as it's not in categories
Row 8: Skipped value 'd' as it's not in categories
Row 9: Skipped value '0' as it's not in categories
Row 10: Skipped value 'd' as it's not in categories
Row 11: Encoded 'A2' as 1.0
Row 12: Skipped value '0' as it's not in categories
Row 13: Skipped value '0' as it's not in categories
Row 14: Skipped value 'u' as it's not in categories
Row 15: Skipped value '0' as it's not in categories
Row 16: Skipped value 'u' as it's not in categories
Row 17: Encoded 'A2' as 1.0
Row 18: Skipped value 'd' as it's not in categories
Row 19: Skipped value 'd' as it's not in categories
Row 20: Skipped value 'd' as it's not in c



In [284]:
fdf

Unnamed: 0,word,score_of_prev_model,fix,type_of_word,f1,f2,f3,f4,f5,f6,score_encoded
0,прагматикалық,B1,B2,сн,0,0,0,8,0,116,3.0
1,археологиялық,B2,0,сн,103,10,0,3,0,28,3.0
2,думан,B2,d,зт,100,0,6,10,7,14,2.0
3,графика,B2,0,зт,89,1,0,13,1,54,3.0
4,небәрі,B2,d,үс,27,1,5,21,7,79,2.0
...,...,...,...,...,...,...,...,...,...,...,...
1038,жұмыскер,C1,A2,зт,11,10,10,4,2,27,1.0
1039,алды (алд),A2,0,зт,175,7,337,18,29,0,1.0
1040,үміт,A2,0,зт,365,9,96,100,76,533,1.0
1041,құндылық,A1,0,зт,209,134,0,150,0,589,0.0


In [285]:
fdf

Unnamed: 0,word,score_of_prev_model,fix,type_of_word,f1,f2,f3,f4,f5,f6,score_encoded
0,прагматикалық,B1,B2,сн,0,0,0,8,0,116,3.0
1,археологиялық,B2,0,сн,103,10,0,3,0,28,3.0
2,думан,B2,d,зт,100,0,6,10,7,14,2.0
3,графика,B2,0,зт,89,1,0,13,1,54,3.0
4,небәрі,B2,d,үс,27,1,5,21,7,79,2.0
...,...,...,...,...,...,...,...,...,...,...,...
1038,жұмыскер,C1,A2,зт,11,10,10,4,2,27,1.0
1039,алды (алд),A2,0,зт,175,7,337,18,29,0,1.0
1040,үміт,A2,0,зт,365,9,96,100,76,533,1.0
1041,құндылық,A1,0,зт,209,134,0,150,0,589,0.0


In [286]:
number_of_letters = []
for i, row in fdf.iterrows():
    number_of_letters.append(len(row.word))

In [287]:
fdf['number_of_letters'] = number_of_letters

here, I created a new feature that is basically how chatgpt assigns complexity level of words

In [288]:
chatgpt_words = pd.read_csv('data/word_complexity_chatgpt.csv')

In [289]:
chatgpt_words

Unnamed: 0.1,Unnamed: 0,word,complexity_level
0,0,прагматикалық,C1
1,1,археологиялық,C1
2,2,думан,B1
3,3,графика,B1
4,4,небәрі,B1
...,...,...,...
1038,1038,жұмыскер,B2
1039,1039,алды (алд),B2
1040,1040,үміт,A1
1041,1041,құндылық,B2


In [290]:
for i, row in fdf.iterrows():
    if chatgpt_words.complexity_level[i] in categories[0]:
        fdf.loc[i, "chatgpt_score_encoded"] = encoder.transform([[chatgpt_words['complexity_level'][i]]])[0, 0]
        print(f"Row {i}: Encoded '{fdf.fix[i]}' as {fdf.loc[i, 'score_encoded']}")
    else:
        print(f"Row {i}: Skipped value '{fdf.fix[i]}' as it's not in categories")



Row 0: Encoded 'B2' as 3.0
Row 1: Encoded '0' as 3.0
Row 2: Encoded 'd' as 2.0
Row 3: Encoded '0' as 3.0
Row 4: Encoded 'd' as 2.0
Row 5: Encoded '0' as 2.0
Row 6: Encoded 'u' as 2.0
Row 7: Encoded '0' as 3.0
Row 8: Encoded 'd' as 2.0
Row 9: Encoded '0' as 3.0
Row 10: Encoded 'd' as 2.0
Row 11: Encoded 'A2' as 1.0
Row 12: Encoded '0' as 3.0
Row 13: Encoded '0' as 3.0
Row 14: Encoded 'u' as 3.0
Row 15: Encoded '0' as 3.0
Row 16: Encoded 'u' as 3.0
Row 17: Encoded 'A2' as 1.0
Row 18: Encoded 'd' as 2.0
Row 19: Encoded 'd' as 2.0
Row 20: Encoded 'd' as 2.0
Row 21: Encoded 'A1' as 0.0
Row 22: Encoded 'd' as 1.0
Row 23: Encoded 'd' as 1.0
Row 24: Encoded 'd' as 2.0
Row 25: Encoded 'u' as 3.0
Row 26: Encoded 'd' as 2.0
Row 27: Encoded 'u' as 3.0
Row 28: Encoded '0' as 3.0
Row 29: Encoded 'A2' as 1.0
Row 30: Encoded '0' as 2.0
Row 31: Encoded 'u' as 4.0
Row 32: Encoded '0' as 3.0
Row 33: Encoded '0' as 3.0
Row 34: Encoded '0' as 3.0
Row 35: Encoded 'u' as 3.0
Row 36: Encoded 'u' as 3.0
Row 37



Row 506: Encoded '0' as 0.0
Row 507: Encoded '0' as 0.0
Row 508: Encoded '0' as 1.0
Row 509: Encoded '0' as 2.0
Row 510: Encoded '0' as 3.0
Row 511: Encoded '0' as 3.0
Row 512: Encoded '0' as 3.0
Row 513: Encoded '0' as 0.0
Row 514: Encoded '0' as 0.0
Row 515: Encoded '0' as 0.0
Row 516: Encoded '0' as 0.0
Row 517: Encoded '0' as 0.0
Row 518: Encoded '0' as 0.0
Row 519: Encoded '0' as 0.0
Row 520: Encoded '0' as 0.0
Row 521: Encoded '0' as 0.0
Row 522: Encoded '0' as 0.0
Row 523: Encoded '0' as 2.0
Row 524: Encoded '0' as 2.0
Row 525: Encoded '0' as 0.0
Row 526: Encoded '0' as 0.0
Row 527: Encoded '0' as 2.0
Row 528: Encoded '0' as 3.0
Row 529: Encoded '0' as 0.0
Row 530: Encoded '0' as 0.0
Row 531: Encoded '0' as 1.0
Row 532: Encoded '0' as 0.0
Row 533: Encoded '0' as 2.0
Row 534: Encoded '0' as 1.0
Row 535: Encoded '0' as 0.0
Row 536: Encoded '0' as 0.0
Row 537: Encoded '0' as 1.0
Row 538: Encoded '0' as 1.0
Row 539: Encoded '0' as 0.0
Row 540: Encoded '0' as 1.0
Row 541: Encoded '0'

Row 927: Encoded '0' as 0.0
Row 928: Encoded 'u' as 1.0
Row 929: Encoded 'u' as 1.0
Row 930: Encoded '0' as 0.0
Row 931: Encoded 'B1' as 2.0
Row 932: Encoded '0' as 0.0
Row 933: Encoded 'A1' as 0.0
Row 934: Encoded 'A1' as 0.0
Row 935: Encoded 'u' as 1.0
Row 936: Encoded '0' as 0.0
Row 937: Encoded '0' as 0.0
Row 938: Encoded '0' as 0.0
Row 939: Encoded '0' as 0.0
Row 940: Encoded '0' as 0.0
Row 941: Encoded '0' as 1.0
Row 942: Encoded '0' as 3.0
Row 943: Encoded '0' as 3.0
Row 944: Encoded 'u' as 2.0
Row 945: Encoded '0' as 1.0
Row 946: Encoded '0' as 0.0
Row 947: Encoded '0' as 0.0
Row 948: Encoded '0' as 0.0
Row 949: Encoded '0' as 0.0
Row 950: Encoded '0' as 0.0
Row 951: Encoded 'B1' as 2.0
Row 952: Encoded '0' as 0.0
Row 953: Encoded '0' as 0.0
Row 954: Encoded '0' as 0.0
Row 955: Encoded '0' as 0.0
Row 956: Encoded '0' as 0.0
Row 957: Encoded '0' as 0.0
Row 958: Encoded '0' as 0.0
Row 959: Encoded '0' as 0.0
Row 960: Encoded '0' as 0.0
Row 961: Encoded 'B1' as 2.0
Row 962: Encode



In [291]:
fdf.drop(labels=['score_of_prev_model', 'fix', 'type_of_word'], axis=1, inplace=True)

In [292]:
fdf

Unnamed: 0,word,f1,f2,f3,f4,f5,f6,score_encoded,number_of_letters,chatgpt_score_encoded
0,прагматикалық,0,0,0,8,0,116,3.0,13,4.0
1,археологиялық,103,10,0,3,0,28,3.0,13,4.0
2,думан,100,0,6,10,7,14,2.0,5,2.0
3,графика,89,1,0,13,1,54,3.0,7,2.0
4,небәрі,27,1,5,21,7,79,2.0,6,2.0
...,...,...,...,...,...,...,...,...,...,...
1038,жұмыскер,11,10,10,4,2,27,1.0,8,3.0
1039,алды (алд),175,7,337,18,29,0,1.0,10,3.0
1040,үміт,365,9,96,100,76,533,1.0,4,0.0
1041,құндылық,209,134,0,150,0,589,0.0,8,3.0


In [293]:
fdf = fdf.iloc[:, [0, 1, 2, 3, 4, 5, 6, 8, 9, 7]]

In [294]:
fdf

Unnamed: 0,word,f1,f2,f3,f4,f5,f6,number_of_letters,chatgpt_score_encoded,score_encoded
0,прагматикалық,0,0,0,8,0,116,13,4.0,3.0
1,археологиялық,103,10,0,3,0,28,13,4.0,3.0
2,думан,100,0,6,10,7,14,5,2.0,2.0
3,графика,89,1,0,13,1,54,7,2.0,3.0
4,небәрі,27,1,5,21,7,79,6,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...
1038,жұмыскер,11,10,10,4,2,27,8,3.0,1.0
1039,алды (алд),175,7,337,18,29,0,10,3.0,1.0
1040,үміт,365,9,96,100,76,533,4,0.0,1.0
1041,құндылық,209,134,0,150,0,589,8,3.0,0.0


In [295]:
fdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1043 entries, 0 to 1042
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   word                   1043 non-null   object 
 1   f1                     1043 non-null   object 
 2   f2                     1043 non-null   object 
 3   f3                     1043 non-null   object 
 4   f4                     1043 non-null   object 
 5   f5                     1043 non-null   object 
 6   f6                     1043 non-null   object 
 7   number_of_letters      1043 non-null   int64  
 8   chatgpt_score_encoded  1043 non-null   float64
 9   score_encoded          1043 non-null   float64
dtypes: float64(2), int64(1), object(7)
memory usage: 81.6+ KB


In [296]:
fdf = fdf.astype({'f1': 'int64',
                    'f2': 'int64',
                    'f3': 'int64',
                    'f4': 'int64',
                    'f5': 'int64',
                    'f6': 'int64'})

In [297]:
fdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1043 entries, 0 to 1042
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   word                   1043 non-null   object 
 1   f1                     1043 non-null   int64  
 2   f2                     1043 non-null   int64  
 3   f3                     1043 non-null   int64  
 4   f4                     1043 non-null   int64  
 5   f5                     1043 non-null   int64  
 6   f6                     1043 non-null   int64  
 7   number_of_letters      1043 non-null   int64  
 8   chatgpt_score_encoded  1043 non-null   float64
 9   score_encoded          1043 non-null   float64
dtypes: float64(2), int64(7), object(1)
memory usage: 81.6+ KB


In [298]:
fdf[fdf.word.duplicated()]

Unnamed: 0,word,f1,f2,f3,f4,f5,f6,number_of_letters,chatgpt_score_encoded,score_encoded
19,үйір,37,17,1,5,3,19,4,0.0,2.0
20,үйір,20,8,0,5,1,11,4,0.0,2.0
23,құрт,8,5,12,10,3,24,4,0.0,1.0
36,хош,3,0,3,0,0,3,3,0.0,3.0
54,еңсе,0,0,0,0,0,3,4,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...
1013,тәжірибе,3,0,0,0,0,0,8,3.0,1.0
1020,тең,4,0,1,0,20,105,3,0.0,1.0
1026,ата,503,0,610,414,456,518,3,0.0,0.0
1030,дөңгелек,155,7,18,10,6,31,8,3.0,0.0


In [299]:
fdf[fdf.word == 'үйір']

Unnamed: 0,word,f1,f2,f3,f4,f5,f6,number_of_letters,chatgpt_score_encoded,score_encoded
18,үйір,21,11,0,10,7,55,4,0.0,2.0
19,үйір,37,17,1,5,3,19,4,0.0,2.0
20,үйір,20,8,0,5,1,11,4,0.0,2.0


There are duplicates of words since there are words with same spelling but different part of speech.
I decided to sum up their features.

In [300]:
grouped = fdf.groupby(by = 'word',as_index=False)[['f1', 'f2', 'f3', 'f4', 'f5', 'f6']].sum()
grouped[grouped['word'] == 'үйір']

Unnamed: 0,word,f1,f2,f3,f4,f5,f6
857,үйір,78,36,1,20,11,85


In [301]:
fdf.drop(fdf[fdf.word.duplicated()].index, inplace=True)

In [302]:
fdf.word.duplicated().sum()

0

In [303]:
grouped.word.duplicated().sum()

0

In [304]:
fdf.drop(['f1', 'f2', 'f3', 'f4', 'f5', 'f6'], axis=1, inplace=True)
fdf

Unnamed: 0,word,number_of_letters,chatgpt_score_encoded,score_encoded
0,прагматикалық,13,4.0,3.0
1,археологиялық,13,4.0,3.0
2,думан,5,2.0,2.0
3,графика,7,2.0,3.0
4,небәрі,6,2.0,2.0
...,...,...,...,...
1038,жұмыскер,8,3.0,1.0
1039,алды (алд),10,3.0,1.0
1040,үміт,4,0.0,1.0
1041,құндылық,8,3.0,0.0


In [305]:
final_fdf = pd.merge(fdf, grouped, on='word')
final_fdf

Unnamed: 0,word,number_of_letters,chatgpt_score_encoded,score_encoded,f1,f2,f3,f4,f5,f6
0,прагматикалық,13,4.0,3.0,0,0,0,8,0,116
1,археологиялық,13,4.0,3.0,103,10,0,3,0,28
2,думан,5,2.0,2.0,100,0,6,10,7,14
3,графика,7,2.0,3.0,89,1,0,13,1,54
4,небәрі,6,2.0,2.0,27,1,5,21,7,79
...,...,...,...,...,...,...,...,...,...,...
911,жұмыскер,8,3.0,1.0,11,10,10,4,2,27
912,алды (алд),10,3.0,1.0,175,7,337,18,29,0
913,үміт,4,0.0,1.0,365,9,96,100,76,533
914,құндылық,8,3.0,0.0,209,134,0,150,0,589


In [306]:
final_fdf

Unnamed: 0,word,number_of_letters,chatgpt_score_encoded,score_encoded,f1,f2,f3,f4,f5,f6
0,прагматикалық,13,4.0,3.0,0,0,0,8,0,116
1,археологиялық,13,4.0,3.0,103,10,0,3,0,28
2,думан,5,2.0,2.0,100,0,6,10,7,14
3,графика,7,2.0,3.0,89,1,0,13,1,54
4,небәрі,6,2.0,2.0,27,1,5,21,7,79
...,...,...,...,...,...,...,...,...,...,...
911,жұмыскер,8,3.0,1.0,11,10,10,4,2,27
912,алды (алд),10,3.0,1.0,175,7,337,18,29,0
913,үміт,4,0.0,1.0,365,9,96,100,76,533
914,құндылық,8,3.0,0.0,209,134,0,150,0,589


In [307]:
final_fdf = final_fdf.reindex(columns=['word', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'number_of_letters', 'chatgpt_score_encoded', 'score_encoded'])

Final check:

In [308]:
final_fdf[final_fdf.word == 'үйір']

Unnamed: 0,word,f1,f2,f3,f4,f5,f6,number_of_letters,chatgpt_score_encoded,score_encoded
18,үйір,78,36,1,20,11,85,4,0.0,2.0


In [309]:
final_fdf

Unnamed: 0,word,f1,f2,f3,f4,f5,f6,number_of_letters,chatgpt_score_encoded,score_encoded
0,прагматикалық,0,0,0,8,0,116,13,4.0,3.0
1,археологиялық,103,10,0,3,0,28,13,4.0,3.0
2,думан,100,0,6,10,7,14,5,2.0,2.0
3,графика,89,1,0,13,1,54,7,2.0,3.0
4,небәрі,27,1,5,21,7,79,6,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...
911,жұмыскер,11,10,10,4,2,27,8,3.0,1.0
912,алды (алд),175,7,337,18,29,0,10,3.0,1.0
913,үміт,365,9,96,100,76,533,4,0.0,1.0
914,құндылық,209,134,0,150,0,589,8,3.0,0.0


At this point, I was told that there are another feature that can be added

In [310]:
df7 = pd.read_csv('data/f7_added.csv')
df7

Unnamed: 0,1,5167,болу,28764,4604,9506,9549,7830,60253,47880,108133,788
0,2,22195,ол,14143,5271,1711,7039,4358,32522,30274,62796,749
1,3,20374,мен,18249,4500,4650,4450,3146,34995,23411,58406,511
2,4,7612,деу,16392,329,5973,4498,2480,29672,16528,46200,755
3,5,1046,алу,13294,3507,4382,2726,4231,28140,15219,43359,481
4,6,4804,беру,9156,2679,3761,5808,2569,23973,16322,40295,167
...,...,...,...,...,...,...,...,...,...,...,...,...
36088,36261,36243,эшелондық,-,-,-,-,-,-,1,1,0
36089,36262,36244,юморлы,-,-,-,1,-,1,-,1,0
36090,36263,36252,ядролық-химиялық,1,-,-,-,-,1,-,1,0
36091,36264,36263,ясауитанушы,-,-,-,-,-,-,1,1,0


In [311]:
first_row = pd.DataFrame([df7.columns], columns=df7.columns)
df7 = pd.concat([first_row, df7], ignore_index=True)
df7.columns = np.arange(0, df7.shape[1])

In [312]:
df7.drop([0, 1, 3, 4, 5, 6, 7, 8, 9, 10], axis=1, inplace=True)

In [313]:
df7.columns = ['word', 'f7']

In [314]:
final_fdf = pd.merge(final_fdf, df7)

In [315]:
final_fdf = final_fdf.loc[:, ['word', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'number_of_letters', 'chatgpt_score_encoded', 'score_encoded']]

In [316]:
final_fdf.score_encoded = final_fdf.score_encoded.astype('int')
final_fdf.chatgpt_score_encoded = final_fdf.chatgpt_score_encoded.astype('int')

In [317]:
final_fdf

Unnamed: 0,word,f1,f2,f3,f4,f5,f6,f7,number_of_letters,chatgpt_score_encoded,score_encoded
0,прагматикалық,0,0,0,8,0,116,0,13,4,3
1,археологиялық,103,10,0,3,0,28,0,13,4,3
2,думан,100,0,6,10,7,14,3,5,2,2
3,графика,89,1,0,13,1,54,0,7,2,3
4,небәрі,27,1,5,21,7,79,0,6,2,2
...,...,...,...,...,...,...,...,...,...,...,...
1000,жұмыскер,11,10,10,4,2,27,0,8,3,1
1001,алды (алд),175,7,337,18,29,0,33,10,3,1
1002,үміт,365,9,96,100,76,533,3,4,0,1
1003,құндылық,209,134,0,150,0,589,5,8,3,0


At this point, I decided to generate a new feature from Kazakh Fasttext

In [318]:
from gensim.models.fasttext import load_facebook_model
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

fasttext_model = load_facebook_model("data/cc.kk.300.bin")

In [319]:
test_word = fasttext_model.wv["кітап"]

In [320]:
fasttext_model.wv.most_similar(test_word) 

[('кітап', 1.0),
 ('кітап.Кітап', 0.8097689747810364),
 ('кітап.', 0.7452438473701477),
 ('кітап-книга', 0.7342228293418884),
 ('кітапбы', 0.7339060306549072),
 ('кітап-', 0.7292510867118835),
 ('кітапа', 0.7270830273628235),
 ('кітапВ', 0.7246437072753906),
 ('кітап-книга-book', 0.7241933941841125),
 ('кітапы', 0.7238143682479858)]

In [321]:
def get_fasttext_embedding(word):
    return fasttext_model.wv[word] if word in fasttext_model.wv else None

def get_mean_embedding(words):
    vectors = [get_fasttext_embedding(word) for word in words]
    vectors = [v for v in vectors if v is not None]  
    return np.mean(vectors, axis=0)

In [327]:
A_words = final_fdf.word[(final_fdf.score_encoded == 0) | (final_fdf.score_encoded == 1)].tolist()
B_words = final_fdf.word[(final_fdf.score_encoded == 2) | (final_fdf.score_encoded == 3)].tolist()
C_words = final_fdf.word[final_fdf.score_encoded == 4].tolist()

In [328]:
A_mean = get_mean_embedding(A_words)
B_mean = get_mean_embedding(B_words)
C_mean = get_mean_embedding(C_words)

In [329]:
A_fts = [cosine_similarity([A_mean], [get_fasttext_embedding(i)])[0][0] for i in final_fdf.word]
B_fts = [cosine_similarity([B_mean], [get_fasttext_embedding(i)])[0][0] for i in final_fdf.word]
C_fts = [cosine_similarity([C_mean], [get_fasttext_embedding(i)])[0][0] for i in final_fdf.word]

final_fdf['fasttext_A_sim'] = pd.DataFrame(A_fts)
final_fdf['fasttext_B_sim'] = pd.DataFrame(B_fts)
final_fdf['fasttext_C_sim'] = pd.DataFrame(C_fts)
final_fdf

Unnamed: 0,word,f1,f2,f3,f4,f5,f6,f7,number_of_letters,chatgpt_score_encoded,score_encoded,fasttext_A_sim,fasttext_B_sim,fasttext_C_sim
0,прагматикалық,0,0,0,8,0,116,0,13,4,3,0.245056,0.343533,0.348651
1,археологиялық,103,10,0,3,0,28,0,13,4,3,0.311971,0.369599,0.315624
2,думан,100,0,6,10,7,14,3,5,2,2,0.350175,0.371274,0.295582
3,графика,89,1,0,13,1,54,0,7,2,3,0.232166,0.242722,0.273367
4,небәрі,27,1,5,21,7,79,0,6,2,2,0.344129,0.267874,0.243401
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,жұмыскер,11,10,10,4,2,27,0,8,3,1,0.371985,0.364102,0.274501
1001,алды (алд),175,7,337,18,29,0,33,10,3,1,0.532238,0.562550,0.534421
1002,үміт,365,9,96,100,76,533,3,4,0,1,0.326001,0.404425,0.327951
1003,құндылық,209,134,0,150,0,589,5,8,3,0,0.354230,0.425133,0.332415


In [348]:
test_word = get_fasttext_embedding('мен')

In [349]:
print(f'A-similarity: {cosine_similarity([A_mean], [test_word])[0][0]}\nB-similarity: {cosine_similarity([B_mean], [test_word])[0][0]}\nC-similarity: {cosine_similarity([C_mean], [test_word])[0][0]}')

A-similarity: 0.47599199414253235
B-similarity: 0.4272915720939636
C-similarity: 0.38814613223075867


In [352]:
test_word = get_fasttext_embedding('көкет')

In [353]:
print(f'A-similarity: {cosine_similarity([A_mean], [test_word])[0][0]}\nB-similarity: {cosine_similarity([B_mean], [test_word])[0][0]}\nC-similarity: {cosine_similarity([C_mean], [test_word])[0][0]}')

A-similarity: 0.24075892567634583
B-similarity: 0.25957706570625305
C-similarity: 0.33527156710624695


In [330]:
final_fdf = final_fdf[[c for c in final_fdf.columns if c != 'score_encoded'] + ['score_encoded']]
final_fdf

Unnamed: 0,word,f1,f2,f3,f4,f5,f6,f7,number_of_letters,chatgpt_score_encoded,fasttext_A_sim,fasttext_B_sim,fasttext_C_sim,score_encoded
0,прагматикалық,0,0,0,8,0,116,0,13,4,0.245056,0.343533,0.348651,3
1,археологиялық,103,10,0,3,0,28,0,13,4,0.311971,0.369599,0.315624,3
2,думан,100,0,6,10,7,14,3,5,2,0.350175,0.371274,0.295582,2
3,графика,89,1,0,13,1,54,0,7,2,0.232166,0.242722,0.273367,3
4,небәрі,27,1,5,21,7,79,0,6,2,0.344129,0.267874,0.243401,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000,жұмыскер,11,10,10,4,2,27,0,8,3,0.371985,0.364102,0.274501,1
1001,алды (алд),175,7,337,18,29,0,33,10,3,0.532238,0.562550,0.534421,1
1002,үміт,365,9,96,100,76,533,3,4,0,0.326001,0.404425,0.327951,1
1003,құндылық,209,134,0,150,0,589,5,8,3,0.354230,0.425133,0.332415,0


In [331]:
final_fdf.to_csv('data/preprocessed_data.csv')