In [None]:
import pandas as pd
# Read the enriched historic lead export straight from the S3 bucket.
df = pd.read_csv(
    "s3://amos-training-data/100k_historic_enriched.csv"
)

# Merchant size labels in the order used for tables and plots
# (presumably smallest to largest - confirm with the data owners).
categories_order = [
    'XS',
    'S',
    'M',
    'L',
    'XL',
]

# Display the dtype pandas inferred for every column.
df.dtypes

In [None]:
# Print the number of rows, then display the share of rows per merchant size.
n_rows = len(df)
print(n_rows)
df.groupby(by=['MerchantSizeByDPV']).size().div(n_rows)

In [None]:
# Check for repetitions: a share > 0 means that some leads occur multiple times
# according to the respective identifier column.
identifier = df[['Phone','Email','Company Name','number_formatted','google_places_place_id','google_places_formatted_address','google_places_name','google_places_detailed_website']]
for col in identifier:
    # count() and nunique() both ignore missing values, so the ratio is computed
    # on the same population (unique() would count NaN as an extra value).
    n_present = identifier[col].count()
    n_unique = identifier[col].nunique()
    # A column without any value has no meaningful duplicate share.
    duplicate_share = 1 - n_unique / n_present if n_present else float('nan')
    print(f'{col}: {n_unique} ({duplicate_share})')

In [None]:
""" Regionalatlas: Placeholder
2222222222: nichts vorhanden, genau 0
5555555555: Zahlenwert unbekannt oder geheim zu halten
6666666666: Tabellenfach gesperrt, da Aussage nicht sinnvoll
7777777777: keine Angabe, da Zahlenwert nicht sicher genug
8888888888: Angabe fällt später an
"""

# Regionalatlas placeholder codes (English translation of the note above):
#   2222222222: nothing present, exactly 0
#   5555555555: value unknown or confidential
#   6666666666: table cell blocked because the statement is not meaningful
#   7777777777: no data because the value is not reliable enough
#   8888888888: value becomes available later
exclude_values = [2222222222.0, 5555555555.0, 6666666666.0, 7777777777.0, 8888888888.0]

# Only rows that have a value in every regional column are inspected.
regional_df = df.filter(like='regional', axis=1).dropna()

# rem_dic maps column -> {placeholder (as string) -> row labels containing it}.
rem_dic = {}
# All affected columns in discovery order (a column appears once per placeholder found in it).
columns = []

# Copy of regional_df from which every row holding a placeholder is removed.
filter_df = regional_df.copy()

for exc in exclude_values:
    # Boolean frame marking where this placeholder occurs in the regional data.
    hits = regional_df == exc

    # Columns that contain the placeholder at least once.
    col = hits.columns[hits.any(axis=0)].tolist()
    columns += col

    for c in col:
        # Row labels where *this* column holds the placeholder.
        indices = hits.index[hits[c]].tolist()

        # Keep the entries of earlier placeholders for the same column.
        rem_dic.setdefault(c, {})[str(exc)] = indices

        # Build the mask from filter_df itself so that it always aligns with it.
        filter_df = filter_df.loc[filter_df[c] != exc]
        print(f'column:{c}, value:{exc}')

print(rem_dic)

In [None]:
# Regionalatlas marks irregular entries with very large placeholder numbers.
# Replace them with NaN so they can be treated as missing values later on.
import numpy as np
regional_atlas = [name for name in df.columns if name.startswith('regional_atlas')]

print("Changed the following features, because of irregular values of regionalatlas:")
for col in regional_atlas:
    # Every value at or above the smallest placeholder is considered irregular.
    irregular = df[col] >= 2222222222
    n_irr = irregular.sum()
    n = df[col].notnull().sum()

    # Only report the columns that actually contain placeholders.
    if n_irr > 0:
        print(col+': '+str(n_irr)+' out of '+ str(n))

    df[col] = np.where(irregular, np.nan, df[col])


In [None]:
# Count rows with and without a value in the phone-search place-id match column.
match_column = df['google_places_place_id_matches_phone_search']
isna = match_column.isna().sum()
print(f'Empty: {isna}')
print(f'Not empty: {len(df)-isna}')

In [None]:
# Non-null number_area entries per merchant size, in display order.
print(df.groupby('MerchantSizeByDPV')['number_area'].count().reindex(categories_order))

# Per-size non-null counts for the rows whose number_country is 'Germany'.
tmp = df[df['number_country']=='Germany'].groupby('MerchantSizeByDPV').count()

# NOTE(review): 129 is an unexplained scaling constant - the counts are rescaled
# so that the 'Last Name' counts add up to 129. Confirm what it stands for.
scale = tmp['Last Name'].sum() / 129
tmp['First Name'].div(scale).reindex(categories_order)

In [None]:
# Numeric features whose value range is reported.
min_max_columns = [
    'google_places_user_ratings_total',
    'google_places_rating',
    'google_places_price_level',
    'reviews_sentiment_score',
    'regional_atlas_age_0',
    'regional_atlas_age_1',
    'regional_atlas_age_2',
    'regional_atlas_age_3',
    'regional_atlas_age_4',
    'regional_atlas_per_service_sector',
    'regional_atlas_per_trade',
    'regional_atlas_employment_rate',
    'regional_atlas_unemployment_rate',
    'regional_atlas_per_long_term_unemployment',
    'regional_atlas_pop_density',
    'regional_atlas_pop_development',
    'regional_atlas_pop_avg_age',
    'regional_atlas_investments_p_employee',
    'regional_atlas_gross_salary_p_employee',
    'regional_atlas_disp_income_p_inhabitant',
    'regional_atlas_tot_income_p_taxpayer',
    'regional_atlas_gdp_p_employee',
    'regional_atlas_gdp_development',
    'regional_atlas_gdp_p_inhabitant',
    'regional_atlas_gdp_p_workhours',
    'regional_atlas_pop_avg_age_zensus',
    'regional_atlas_regional_score',
]

# One row for "min" and one for "max", one column per feature.
min_max_df = df.agg({name: ["min", "max"] for name in min_max_columns})

# Print the closed value interval of every feature.
for col in min_max_df.columns:
    min_feature = min_max_df.at['min', col]
    max_feature = min_max_df.at['max', col]
    print(f'{col}: [{min_feature}, {max_feature}]')

In [None]:
from scipy.stats import percentileofscore

# Features for which a percentile-rank column ("<feature>_percentiles") is added.
percentile_col = [
'regional_atlas_age_0',
'regional_atlas_age_1',
 'regional_atlas_age_2',
'regional_atlas_age_3',
'regional_atlas_age_4',
'google_places_user_ratings_total','google_places_rating','reviews_sentiment_score','regional_atlas_pop_density',
'regional_atlas_pop_development',
'regional_atlas_pop_avg_age',
'regional_atlas_per_service_sector',
'regional_atlas_per_trade',
'regional_atlas_employment_rate',
'regional_atlas_unemployment_rate',
'regional_atlas_per_long_term_unemployment',
'regional_atlas_investments_p_employee',
'regional_atlas_gross_salary_p_employee',
'regional_atlas_disp_income_p_inhabitant',
'regional_atlas_tot_income_p_taxpayer',
'regional_atlas_gdp_p_employee',
'regional_atlas_gdp_development',
'regional_atlas_gdp_p_inhabitant',
'regional_atlas_gdp_p_workhours',
'regional_atlas_pop_avg_age_zensus',
'regional_atlas_regional_score']

for col in percentile_col:
    col_name = col+'_percentiles'
    # Average rank divided by the number of non-missing values, scaled to 0-100.
    # This matches scipy's percentileofscore(values, x, kind='rank') for every x
    # in the column, but needs one sort per column instead of one full pass per
    # row. Missing values stay missing.
    df[col_name] = df[col].rank(method='average', pct=True) * 100

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One figure per feature: density of its percentile column, one curve per merchant size.
classes = df['MerchantSizeByDPV'].unique()

for col in percentile_col:
    feature = f"{col}_percentiles"

    # Restrict to the rows where the percentile is available.
    not_nan = df[feature].notnull()
    available = df[not_nan]

    for c in classes:
        of_class = available['MerchantSizeByDPV'] == c
        sns.kdeplot(available[of_class][feature], fill=False, label=c)

    # Labels, title and legend
    plt.xlabel('Value')
    plt.ylabel('Density')
    plt.title('Distribution of '+col)
    plt.legend()

    # Render the figure before starting the next one
    plt.show()

In [None]:
# Two-dimensional density of rating count vs. average rating (both as percentiles),
# one figure per merchant size.
rating_features = ['google_places_user_ratings_total_percentiles', 'google_places_rating_percentiles']

# Select the rows that have both plotted features and a merchant size. The mask is
# built here instead of reusing a variable left over from another cell's loop,
# which described a different column.
rated = df.dropna(subset=rating_features + ['MerchantSizeByDPV'])

for c in rated['MerchantSizeByDPV'].unique():
    tmp = rated[rated['MerchantSizeByDPV']==c]
    sns.kdeplot(x=tmp[rating_features[0]], y=tmp[rating_features[1]], fill=False, label=c)

    plt.xlabel('ratings_total')
    plt.ylabel('rating_avg')
    plt.title(f'Distribution of {c}')
    plt.show()

In [None]:
import numpy as np
# Number of False entries for every boolean column of the data set.
arr_false = {
    column: np.count_nonzero(df[column] == False)
    for column in df
    if df[column].dtype == bool
}

print(arr_false)

In [None]:
"""
Dangerous to come to a conclusion based on Gender.
"""
import gender_guesser.detector as gender
gd = gender.Detector()

# Missing first names cannot be capitalised; they are mapped to an empty string,
# which the detector is expected to classify as 'unknown'.
first_names = df['First Name'].fillna('').astype(str).str.capitalize()
df['Gender'] = first_names.map(gd.get_gender)

group_feature = 'Gender'
category_order = ['XS','S','M','L','XL']

# Rows per (gender, merchant size) and rows per gender.
grouped_counts = df.groupby([group_feature, 'MerchantSizeByDPV']).size().reset_index(name='count')
# rename_axis gives the key column a stable name regardless of the pandas version.
total_counts = df[group_feature].value_counts().rename_axis(group_feature).reset_index(name='total_count')

# Share of every merchant size within each gender class.
result = pd.merge(grouped_counts, total_counts, on=group_feature)
result['proportion'] = result['count'] / result['total_count']

# Detector classes and their legend labels ('andy' = androgynous).
gender_labels = {
    'andy': 'Andy',
    'unknown': 'Unknown',
    'mostly_female': 'Mostly Female',
    'mostly_male': 'Mostly Male',
    'male': 'Male',
    'female': 'Female',
}

# Rows: gender class, columns: merchant size in plotting order. Reindexing is what
# guarantees that every value is drawn above its own x label; combinations that do
# not occur become NaN instead of shifting the remaining values.
# For a clearer picture, 'XS' can be removed from category_order.
proportions = (
    result.pivot(index=group_feature, columns='MerchantSizeByDPV', values='proportion')
    .reindex(index=list(gender_labels), columns=category_order)
)

# Plotting
for gender_key, label in gender_labels.items():
    plt.plot(category_order, proportions.loc[gender_key], label=label)

# Set labels and title
plt.xlabel('MerchantSizeByDPV')
plt.ylabel('Proportion')
plt.title('Proportion of MerchantSizeByDPV for Each Gender')

# Display the plot
plt.legend()
plt.show()

In [None]:
# Rows per (MCC level, merchant size), reshaped to one row per MCC level.
mcc_group = df.groupby(by=['MCC Level','MerchantSizeByDPV']).size()
grouped = mcc_group.unstack()
# Total number of rows per MCC level.
mcc_sum = mcc_group.groupby(level=0).sum()

# mcc_sum is an unnamed Series, so concat adds it as column 0.
mcc_df = pd.concat([grouped, mcc_sum], axis=1)
tmp = mcc_df[0]
# Turn the counts into shares per MCC level and order by the share of XS.
mcc_df = mcc_df.divide(mcc_df[0], axis=0).sort_values(by='XS', ascending=True)
# Keep the absolute total next to the shares (aligned on the MCC level index).
mcc_df['Sum'] = tmp

In [None]:
# Remove MCC levels with fewer than 50 examples and report which ones were removed.
print('Dropped the rows due to less than 50 examples:')
print(mcc_df[mcc_df['Sum']<50].index.values)
mcc_df = mcc_df[mcc_df['Sum']>=50]

# The table is ordered by ascending XS share, so the first rows are the most
# attractive categories. Starting at every 10th row, plot 5 consecutive
# categories against the merchant sizes (XS and the helper columns left out).
for start in range(0, mcc_df.shape[0], 10):
    shares = mcc_df.drop([0,'Sum','XS'],axis=1)
    shares.iloc[start:start+5].T.plot.line()

In [None]:
import ast

# Rows that carry a list of Google Places types (stored as its string representation).
data = df[df['google_places_detailed_type'].notnull()]

# Parse every list and collect all types in one flat list.
all_types = []
for item in data.google_places_detailed_type:
    all_types.extend(ast.literal_eval(item))

# How often every type occurs.
test = pd.Series(all_types).value_counts()
test

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Parse the stringified type lists of all rows that have one.
docs = df['google_places_detailed_type'].dropna().apply(ast.literal_eval)

# The documents are already lists of tokens, so the analyzer passes them through.
vectorizer = CountVectorizer(analyzer=lambda tokens: tokens) # , min_df = 50
categories = vectorizer.fit_transform(docs).toarray()
vectorizer.get_feature_names_out()

In [None]:
from scipy.stats import chi2_contingency

# Features treated as categorical; missing values become their own category.
categorical_features = [
    'google_places_candidate_count_mail',
    'google_places_candidate_count_phone',
    'google_places_rating',
    'google_places_price_level',
    'google_places_confidence',
    'MCC Level',
    'Gender',
    'number_area',
    'first_name_in_account',
    'last_name_in_account',
    'google_places_business_status',
    'number_country',
    'number_valid',
    'number_possible',
    'google_places_place_id_matches_phone_search',
]
cat_col = df[categorical_features].fillna('no_data')
# Additional flag: does the lead have a website according to Google Places?
cat_col['b_google_website'] = df['google_places_detailed_website'].notnull()

# Contingency table of merchant size vs. every feature
contingency_tables = {
    feature_column: pd.crosstab(df['MerchantSizeByDPV'], cat_col[feature_column])
    for feature_column in cat_col.columns
}

# Chi-squared test of independence for every table
results = {}
for feature, table in contingency_tables.items():
    chi2_stat, p_value, dof, expected = chi2_contingency(table)
    results[feature] = {
        'Chi-squared stat': chi2_stat,
        'P-value': p_value,
        'Degrees of Freedom': dof,
    }

# Report the statistics feature by feature
for feature, result in results.items():
    print(f"\nChi-squared test for {feature}:")
    print(f"Chi-squared statistic: {result['Chi-squared stat']:.2f}")
    print(f"P-value: {result['P-value']:.4f}")
    print(f"Degrees of freedom: {result['Degrees of Freedom']}")

In [None]:
import matplotlib.pyplot as plt

def b_bayesian(df,bin_column,b_value=True):
    """Return the share of each merchant size among rows where ``bin_column == b_value``.

    The value is computed with Bayes' rule,
    P(size | value) = P(size) * P(value | size) / P(value).

    Parameters:
        df: frame with the columns 'MerchantSizeByDPV', 'Email' and ``bin_column``.
        bin_column: name of the column that is compared with ``b_value``.
        b_value: value of ``bin_column`` to condition on (default True).

    Returns:
        Series indexed by ['XS', 'S', 'M', 'L', 'XL']; sizes for which no row has
        the requested value are NaN.
    """
    size_order = ['XS', 'S', 'M', 'L','XL']

    # Rows showing the requested value and non-null counts per size for all rows.
    matching = df[df[bin_column]==b_value]
    counts_per_size = df.groupby('MerchantSizeByDPV').count()

    # P(size): non-null e-mails per size relative to the number of rows.
    p_size = counts_per_size['Email'] / df.shape[0]
    # P(value): matching rows relative to the number of rows.
    p_value = matching.shape[0] / df[bin_column].shape[0]
    # P(value | size): matching rows per size relative to non-null rows per size.
    p_value_given_size = matching.groupby('MerchantSizeByDPV').count()[bin_column] / counts_per_size[bin_column]

    posterior = (p_size * p_value_given_size) / p_value
    return posterior.reindex(index=size_order)

# Baseline: share of every merchant size in the whole data set.
per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])

# Difference between the conditional share (given the flag value) and the baseline.
series_not_possible = b_bayesian(df,'number_possible',False)-per_size
series_invalid = b_bayesian(df,'number_valid',False)-per_size
series_first_name = b_bayesian(df,'first_name_in_account',True)-per_size
series_last_name = b_bayesian(df,'last_name_in_account',True)-per_size

series_possible = b_bayesian(df,'number_possible',True)-per_size
series_valid = b_bayesian(df,'number_valid',True)-per_size
series_no_first_name = b_bayesian(df,'first_name_in_account',False)-per_size
series_no_last_name = b_bayesian(df,'last_name_in_account',False)-per_size

# Order of the categories on the x axis
categories_order = ['XS', 'S', 'M', 'L','XL']

# (values, legend label, extra plot options) for every line, in drawing order
lines_to_plot = [
    (series_not_possible, 'Number not possible', {'marker': 'o'}),
    (series_invalid, 'Number invalid', {'marker': 'd'}),
    (series_first_name, 'First name in account', {}),
    (series_last_name, 'Last name in account', {}),
    (series_possible, 'Number possible', {}),
    (series_valid, 'Number valid', {}),
    (series_no_first_name, 'First name not in account', {}),
    (series_no_last_name, 'Last name not in account', {}),
]

plt.figure(figsize=(10, 6))
for values, label, options in lines_to_plot:
    plt.plot(categories_order, values, label=label, **options)
# The baseline itself (per_size) is intentionally not drawn.

plt.title('Bayesian')
plt.xlabel('Categories')
plt.ylabel('Percentages')
plt.legend()
plt.grid(True)

# Render the figure
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


class_colors = sns.color_palette("colorblind")[:5]

# assign() builds a new frame; writing the column onto the filtered frame
# directly can trigger pandas' SettingWithCopyWarning.
regional_df = df.filter(like='regional', axis=1).assign(MerchantSizeByDPV=df['MerchantSizeByDPV'])

# Only values below these limits are drawn for the respective column.
boxplot_value_limits = {
    'regional_atlas_pop_development': 2000,
    'regional_atlas_gdp_development': 60,
}

# One figure per regional column with the merchant sizes next to each other.
for column in regional_df.columns[:-1]:  # the last column is 'MerchantSizeByDPV'
    plot_data = regional_df
    if column in boxplot_value_limits:
        plot_data = regional_df[regional_df[column] < boxplot_value_limits[column]]

    axes = sns.boxplot(x="MerchantSizeByDPV", hue="MerchantSizeByDPV", y=column, data=plot_data, palette=class_colors, order=['XS', 'S','M','L','XL'])

    axes.set_title(f'Boxplot of {column}')
    axes.set_xlabel('MerchantSizeByDPV')
    axes.set_ylabel(column)

    # Reference line: median of the XL class, computed on all (unfiltered) rows.
    median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()
    axes.axhline(y=median_value, color='red', linestyle='--', label='Median (XL)')
    axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')

    plt.show()

In [None]:
# Same as the boxplots, but drawn as violin plots.
# Only values below these limits are drawn for the respective column.
violin_value_limits = {
    'regional_atlas_pop_development': 2000,
    'regional_atlas_gdp_development': 60,
}

for column in regional_df.filter(like='regional', axis=1).columns:
    plot_data = regional_df
    if column in violin_value_limits:
        plot_data = regional_df[regional_df[column] < violin_value_limits[column]]

    axes = sns.violinplot(x="MerchantSizeByDPV", hue="MerchantSizeByDPV", y=column, data=plot_data, palette=class_colors, order=['XS', 'S','M','L','XL'])

    # The figure is a violin plot, so the title says so.
    axes.set_title(f'Violinplot of {column}')
    axes.set_xlabel('MerchantSizeByDPV')
    axes.set_ylabel(column)

    # Reference line: median of the XL class, computed on all (unfiltered) rows.
    median_value = regional_df[regional_df['MerchantSizeByDPV'] == 'XL'][column].median()
    axes.axhline(y=median_value, color='red', linestyle='--', label='Median (XL)')
    axes.legend(bbox_to_anchor=(1.05, 0.5), loc='upper right')

    plt.show()

In [None]:
from sklearn import preprocessing

# Normalize the features to [0, 1] before comparing / dividing them
regional_features = regional_df.drop('MerchantSizeByDPV', axis = 1)
x = regional_features.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Keep the original row labels so that the class masks below align with the
# scaled frame for any index, not only for a default RangeIndex.
norm_regio = pd.DataFrame(x_scaled, columns=regional_features.columns, index=regional_features.index)

# Descriptive statistics of the normalized regional data per class, used as a
# heuristic for how well a feature separates XL from XS.
df_stats_XL = norm_regio[regional_df['MerchantSizeByDPV']=='XL'].describe()
df_stats_XS = norm_regio[regional_df['MerchantSizeByDPV']=='XS'].describe()

# Difference of the class medians relative to the interquartile range of XL.
# A feature whose XL quartiles coincide yields inf/NaN here.
((df_stats_XL.loc['50%'] - df_stats_XS.loc['50%'])/(df_stats_XL.loc['75%'] - df_stats_XL.loc['25%'])).sort_values(ascending=False)

In [None]:
import pandas as pd
import numpy as np

# Compute a correlation matrix for all float columns of the dataframe
float_cols = df.columns[df.dtypes==float]
corr_matrix = df[float_cols].corr()

# The diagonal (correlation of each feature with itself) is set to 0 so it is
# filtered out below. mask() returns a new frame instead of writing through
# `.values`, which is not guaranteed to be a writable view.
corr_matrix = corr_matrix.mask(np.eye(len(corr_matrix), dtype=bool), 0)

# Keep only correlations whose absolute value reaches the threshold; everything
# else (including undefined correlations) becomes 0.
correlation_threshold = 0.89
filtered_correlation_df = corr_matrix.where(corr_matrix.abs() >= correlation_threshold, 0)

# Keep the rows and columns that still contain at least one non-zero value
non_zero_rows = filtered_correlation_df.index[(filtered_correlation_df != 0).any(axis=1)]
non_zero_columns = filtered_correlation_df.columns[(filtered_correlation_df != 0).any(axis=0)]
new_correlation_df = filtered_correlation_df.loc[non_zero_rows, non_zero_columns]

# Print the header and draw the reduced correlation matrix
print(f"New Correlation Matrix (values greater than {correlation_threshold}):")

plt.figure(figsize=(12, 10))
heatmap = sns.heatmap(new_correlation_df, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.savefig('correlation_matrix.svg', format='svg')
plt.show()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Complete rows of all regional columns.
reg_df = df.filter(like='regional', axis=1).dropna()

# The target is not a feature. The filter above does not select it, so the drop
# must tolerate its absence instead of raising a KeyError.
pca_features = reg_df.drop(columns=['MerchantSizeByDPV'], errors='ignore')

# Standardize the features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(pca_features)

# Apply PCA
pca = PCA()
principal_components = pca.fit_transform(scaled_data)

# Retrieve explained variance ratios
explained_variance_ratio = pca.explained_variance_ratio_

# Loadings: one row per principal component, one column per feature that was
# actually fed into the PCA.
components = pd.DataFrame(pca.components_, columns=pca_features.columns)

# Print explained variance ratios
for i, ratio in enumerate(explained_variance_ratio, 1):
    print(f"Principal Component {i}: Explained Variance Ratio = {ratio:.4f}")

# Plot the cumulative explained variance
cumulative_variance = explained_variance_ratio.cumsum()

plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Variance Explained')
plt.show()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Count only the tokens that occur in at least 50 e-mail addresses (min_df=50)
count_vectorizer = CountVectorizer(min_df=50)

# Missing e-mail addresses are not valid documents for the vectorizer, skip them.
emails = df['Email'].dropna().astype(str)

# Fit and transform the text data
count_matrix = count_vectorizer.fit_transform(emails)

# Convert the matrix to a DataFrame for better readability. The row labels of df
# are kept so that rows can be looked up in df by label later on.
count_df = pd.DataFrame(
    count_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
    index=emails.index,
)

In [None]:
# Single-row frame: total number of occurrences of every token.
token_totals = count_df.sum()
common_words = pd.DataFrame(token_totals).T

# List the tokens, one per line.
for word in common_words.columns:
    print(word)

In [None]:
# Tokens that are first names (left empty here).
names = list()

# Tokens considered odd; they are excluded from the word analysis (left empty here).
weird_terms = list()
    

In [None]:
# Baseline share of every merchant size in the whole data set.
per_size = (df.groupby('MerchantSizeByDPV').count()['Email']/df.shape[0]).reindex(index=['XS', 'S', 'M', 'L','XL'])

# For every token: share of each merchant size among the rows whose e-mail
# contains the token, minus the baseline share.
grouped_common_words = []
for word in common_words.drop(weird_terms,axis=1): #common_words[names], common_words[weird_terms]
    indices = count_df.index[count_df[word]>0]
    rows_with_word = df.loc[indices]

    size_counts = rows_with_word.groupby('MerchantSizeByDPV').count()['Email']
    per_word = (size_counts/len(rows_with_word)).reindex(index=['XS', 'S', 'M', 'L','XL'])

    grouped_common_words.append((per_word-per_size).rename(word))

# One row per token, one column per merchant size.
common_df = pd.concat(grouped_common_words, axis=1).transpose()

common_df.describe()

In [None]:
# Min / mean / max change of the class probability (- decrease, + increase)
# that comes with the presence of a common word, one line per merchant size.
for size_label in ["XS", "S", "M", "L", "XL"]:
    changes = common_df[size_label]
    print(f'{np.min(changes)}, {np.mean(changes)},{np.max(changes)}')

In [None]:
import multiprocessing
import time
import pandas as pd
import numpy as np
from deutschland.bundesanzeiger import Bundesanzeiger
import pickle
import time

def access_ba(company,b_bundesanzeiger,):
    """Record whether a Bundesanzeiger report lookup for ``company`` succeeds.

    ``True`` is appended to ``b_bundesanzeiger`` first and replaced by ``False``
    if creating the client or fetching the reports raises an exception.

    Parameters:
        company: company name passed to ``Bundesanzeiger.get_reports``.
        b_bundesanzeiger: list-like result collector (a ``multiprocessing``
            manager list when run in a child process).

    Returns:
        None. The outcome is communicated through ``b_bundesanzeiger`` only.
    """
    b_bundesanzeiger.append(True)
    try:
        ba = Bundesanzeiger()
        ba.get_reports(company)
    except Exception:
        # Any failure of the lookup counts as "not available"; KeyboardInterrupt
        # and SystemExit are deliberately not swallowed.
        b_bundesanzeiger[-1] = False


if __name__ == '__main__':

    pd.set_option('display.max_columns', None)

    historic = pd.read_csv('historic.csv',sep = ',')#_enriched

    # Up to 100 leads per merchant size; the fixed seed makes the sample
    # reproducible and min() keeps classes with fewer than 100 rows usable.
    df = historic.groupby('MerchantSizeByDPV').apply(
        lambda x: x.sample(n=min(len(x), 100), random_state=42)
    )

    with multiprocessing.Manager() as manager:

        b_bundesanzeiger = manager.list()
        content_array = []
        durations = []

        for i, company in enumerate(df["Company Name"]):

            print(i)

            start = time.time()
            n_recorded = len(b_bundesanzeiger)

            # Run access_ba in its own process so that it can be stopped after a timeout
            p = multiprocessing.Process(target=access_ba, name="access_ba", args=(company,b_bundesanzeiger))
            p.start()

            # Wait at most 8 seconds for access_ba
            p.join(8)

            timed_out = p.is_alive()
            if timed_out:
                print ("Terminate access_ba")
                p.terminate()

            # Cleanup
            p.join()

            if len(b_bundesanzeiger) == n_recorded:
                # The worker never recorded anything for this company; add an
                # entry so that the results stay aligned with the rows of df.
                b_bundesanzeiger.append('killed' if timed_out else False)
            elif timed_out:
                b_bundesanzeiger[-1] = 'killed'

            print(b_bundesanzeiger[-1])
            end = time.time()
            print(end-start)
            print()
            durations.append(end-start)

        # Persist the outcome, the duration of every lookup and the sampled rows.
        with open('list_file.pkl', 'wb') as file:
            pickle.dump(list(b_bundesanzeiger), file)

        with open('time.pkl', 'wb') as file:
            pickle.dump(durations, file)

        df.to_pickle("./dataframe_sample.pkl")

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files that were
# written by this notebook.
with open('dataframe_sample.pkl', 'rb') as f:
    df = pickle.load(f)

df = df.reset_index(drop=True)

with open('list_file.pkl', 'rb') as f:
    mynewlist = pickle.load(f)

# Named `durations` so that the `time` module imported by this notebook is not shadowed.
with open('time.pkl', 'rb') as f:
    durations = pickle.load(f)

# Outcome and duration of every lookup, in the row order of the sample.
df_stats = pd.DataFrame({'b_bundesanzeiger': mynewlist, 'time': durations})

df['b_bundesanzeiger'] = df_stats['b_bundesanzeiger']
df['time'] = df_stats['time']

# Number of rows per (merchant size, lookup outcome)
counts = df.groupby('MerchantSizeByDPV')['b_bundesanzeiger'].value_counts()

desired_value_counts = counts.unstack().fillna(0)

# Compute total counts per category
total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()

# Share of every outcome within each merchant size
probabilities = desired_value_counts.div(total_counts_per_category, axis=0)

print(probabilities)

In [None]:
# Flag: does the lead have a Google Places id?
df['b_google_places'] = df["google_places_place_id"].notnull()

# Rows per (merchant size, flag value), as a long Series and as a table
counts = df.groupby('MerchantSizeByDPV')['b_google_places'].value_counts()
desired_value_counts = counts.unstack().fillna(0)

# Rows per merchant size
total_counts_per_category = counts.groupby('MerchantSizeByDPV').sum()

# Share of every flag value within each merchant size
probabilities = desired_value_counts.div(total_counts_per_category, axis=0)

print(probabilities)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# SMOTE is used below but is not imported anywhere in this notebook.
from imblearn.over_sampling import SMOTE

# NOTE(review): `regional_columns` was not defined anywhere in this notebook.
# The raw Regionalatlas features (without the derived percentile columns) are
# assumed here - confirm that this is the intended feature set.
regional_columns = [
    col for col in df.columns
    if col.startswith('regional_atlas') and not col.endswith('_percentiles')
]

# Separate features (X) and target variable (y)
table = df[regional_columns+['MerchantSizeByDPV']].dropna()
y = table['MerchantSizeByDPV']
X = table[regional_columns]

# Split first, so that the test set contains real observations only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Oversample the minority classes on the training data only; resampling before
# the split would let synthetic neighbours of test rows leak into training.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Create a logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=4000, class_weight='balanced')

# Fit the model on the balanced training data
model.fit(X_resampled, y_resampled)

# Make predictions on the untouched testing data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Display evaluation metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Hold out 20 % of the resampled data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

# Train a random forest with 100 trees on the training part.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the held-out part and compute the evaluation metrics.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Display evaluation metrics
for metric_line in (
    f'Accuracy: {accuracy:.2f}',
    f'Confusion Matrix:\n{conf_matrix}',
    f'Classification Report:\n{class_report}',
):
    print(metric_line)


In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# X and y are the feature matrix and target variable defined in an earlier cell
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the set of rare classes
rare_classes = ['XL']  # Replace with the actual class labels you consider rare
# MAYBE NOT ONLY XL, but also L and M

# Binary target: 1 if the instance belongs to a rare class, 0 otherwise
y_train_rare = y_train.isin(rare_classes).astype(int)
y_test_rare = y_test.isin(rare_classes).astype(int)

# Create and fit the Isolation Forest model. It is fitted without labels; the
# fixed seed keeps the result reproducible like the other models in this notebook.
if_model = IsolationForest(contamination='auto', random_state=42)
if_model.fit(X_train)

# Predict anomalies on the test set (scikit-learn returns -1 for outliers and 1 for inliers)
y_pred_rare = if_model.predict(X_test)

# Convert to the binary encoding of the target: 1 = anomaly, 0 = normal instance
y_pred_rare_binary = (y_pred_rare == -1).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test_rare, y_pred_rare_binary)
conf_matrix = confusion_matrix(y_test_rare, y_pred_rare_binary)
class_report = classification_report(y_test_rare, y_pred_rare_binary)

# Display evaluation metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=[0,1], yticklabels=[0,1])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
