In [113]:
import pandas as pd
import plotly as py 
import plotly.graph_objs as go
import plotly.express as px
import kaleido
import os
from PIL import Image

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np 
import math as m
import time

from scipy import stats

from sklearn import datasets
from sklearn.model_selection import train_test_split , KFold
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

from collections import Counter
# setup offline mode 
py.offline.init_notebook_mode(connected=True)

pd.set_option('display.max_rows', 20)

In [27]:
def word_to_vector(word: str) -> np.array:
    array = np.zeros(len(CZ_ALPHABET))
    lst = []
    for i, alph in enumerate(CZ_ALPHABET):
        array[i] = word.count(alph)
        lst.append(word.count(alph))
    return pd.Series(lst)

def get_str_substrings(string: str, low: int = 1, high: int = None) -> list:
    if high is None:
        high = len(string)
    substrings = []
    for i in range(0, len(string)):
        for k in range(0, len(string) - i):
            substring = string[k:i+k+1]
            if low <= len(substring) <= high: substrings.append(substring)
    return substrings


# Method 1: Alphabet vector

In [28]:
data = pd.read_csv('data.csv', encoding='ansi', usecols=['Obec', 'Okres', 'Kraj', 'Latitude', 'Longitude'])

In [29]:
data['Obec'] = data['Obec'].str.lower()

In [30]:
CZ_ALPHABET = ['a', 'á', 'b', 'c', 'č', 'd', 'ď',
               'e', 'é', 'ě', 'f', 'g', 'h', 'ch', 
               'i', 'í', 'j', 'k', 'l', 'm', 'n', 
               'ň', 'o', 'ó', 'p', 'q', 'r', 'ř',
               's', 'š', 't', 'ť', 'u', 'ú','ů', 
               'v', 'w', 'x', 'y', 'ý','z', 'ž', ' ']

#abcčdďeéfghiíjklmnňopqrřsštťuúůvwxyýzž

In [31]:
data = pd.read_csv('data.csv', encoding='ansi', usecols=['Obec', 'Okres', 'Kraj', 'Latitude', 'Longitude'])

data[CZ_ALPHABET] = data['Obec'].apply(word_to_vector)

In [32]:
# define x and y
x = data[CZ_ALPHABET]
y = data['Kraj']

x_train, x_test, y_train, y_test= train_test_split(x, y,
                                                   test_size = 0.2,
                                                   shuffle = True, #shuffle the data to avoid bias
                                                   random_state = 0)

In [33]:
K = 5

scaler = Normalizer().fit(x_train) # the scaler is fitted to the training set
normalized_x_train = scaler.transform(x_train) # the scaler is applied to the training set
normalized_x_test = scaler.transform(x_test) # the scaler is applied to the test set

knn = KNeighborsClassifier(K)
knn.fit(normalized_x_train, y_train)
y_pred_sklearn = knn.predict(normalized_x_test)
y_pred_sklearn = pd.Series(y_pred_sklearn)

#TFIDF
accuracy_score(y_test, y_pred_sklearn)

0.12939297124600638

# Method 2: Linguistic parameters 

In [34]:
data['Délka'] = data['Obec'].apply(len)

# Method 3: Automatic linguistic parameters

In [109]:
def tfidf(len_doc_occ: int, len_doc: int, N: int, D: int) -> float:
    tf =  len_doc_occ / len_doc
    idf = N / D
    return tf * idf


data = pd.read_csv('data.csv', encoding = 'ansi', usecols=['Obec', 'Kraj'])
data['Obec'] = data['Obec'].str.lower()
data['substring'] = data['Obec'].apply(lambda string: get_str_substrings(string=string, low=2, high=None))

corpus = data[['substring', 'Kraj']].explode(column='substring')
grouping = corpus.groupby(corpus['substring'])

df_doc_sizes = corpus.groupby(corpus['Kraj']).apply(lambda x: x['substring'].shape[0])

st = time.time()
print('Calculating total frequency of substrings.')
df_substring_stats = grouping.apply(len).reset_index().rename(columns={0 : 'freq'})
df_substring_stats['freq'] = pd.to_numeric(df_substring_stats['freq'])

print('Calculating the number of regions in which the substring occurs.')
df_substring_stats['n_krajů'] = pd.to_numeric(grouping.apply(lambda x: len(x['Kraj'].unique())).reset_index(drop=True))
print(df_substring_stats.columns)
print('Writing out the regions.')
df_substring_stats['kraje'] = grouping.apply(lambda x: x['Kraj'].tolist()).reset_index(drop=True)

print('Calculating mode and count of the regions (for max tfidf).')
df_substring_stats[['kraj_max', 'kraj_max_count']] = df_substring_stats['kraje'].apply(lambda x:
                                                                                       list(np.concatenate(stats.mode(x)))
                                                                                       ).tolist()

df_substring_stats['kraj_max_count'] = pd.to_numeric(df_substring_stats['kraj_max_count'])

print('Calculating the size of the mode region documents.')
df_substring_stats['kraj_size'] = pd.to_numeric(df_substring_stats['kraj_max'].apply(lambda x: df_doc_sizes[x]))

print('Calculating maximum tf.')
df_substring_stats['tf'] = df_substring_stats['kraj_max_count'] / df_substring_stats['kraj_size']

print('Calculating idf')
df_substring_stats['idf'] = np.log2(14 / df_substring_stats['n_krajů'])

print('Calculating max tfidf')
df_substring_stats['max_tfidf'] = df_substring_stats['tf'] * df_substring_stats['idf']

print(f'Done in {time.time() - st:.2f} seconds')
print('Saving to csv.')
df_substring_stats.to_csv('substring_stats.csv')

print('Done saving.')
df_substring_stats

Calculating total frequency of substrings.
Calculating the number of regions in which the substring occurs.
Index(['substring', 'freq', 'n_krajů'], dtype='object')
Writing out the regions.
Calculating mode and count of the regions (for max tfidf).
Calculating the size of the mode region documents.
Calculating maximum tf.
Calculating idf
Calculating max tfidf
Done in 17.43 seconds
Saving to csv.
Done saving.


KeyError: 'tfidf'

In [118]:
df_substring_stats.sort_values(by='max_tfidf', ascending=False)#[['substring', 'max_tfidf']]

In [86]:
lst = [1, 2, 4, 2, 5, 2]
df = pd.DataFrame()
df['null'] = 0
df[['a', 'b']] = list(np.concatenate(stats.mode(lst)).ravel())

Unnamed: 0,null,a,b


In [36]:

jmena = data['Obec'].to_list()
pismena = [jmeno[0] for jmeno in jmena]
print(data[data['Obec'] == 'X'])
px.histogram(pismena)

Empty DataFrame
Columns: [Obec, Kraj, substring]
Index: []


In [37]:
px.scatter(data, x='Longitude', y='Latitude', color='Délka')

px.bar(data.groupby(data['Kraj'])['Délka'].apply(np.mean))

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['Obec', 'Kraj', 'substring'] but received: Longitude

In [None]:
px.scatter(data[data['Obec'].str.contains('anov')], x='Longitude', y='Latitude', custom_data=['Obec'])