In [267]:
from scipy.stats import chi2_contingency
import calendar
import numpy as np
months = list(map(lambda x: x.lower(), calendar.month_name))[1:]
import pandas as pd
from pandas import DataFrame


from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

import matplotlib.pylab as pylab




params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 10),
          'axes.labelsize': 'x-large',
          'axes.titlesize': 'x-large',
          'xtick.labelsize': 'x-large',
          'ytick.labelsize': 'x-large'}
pylab.rcParams.update(params)

import os

project_folder = f'{os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))}/data/process'
import warnings

warnings.filterwarnings(action='ignore')

from IPython.core.display import display, HTML
# Set the display width to fit the entire notebook width
display(HTML("<style>.container { width:100% !important; }</style>"))
from IPython.display import Markdown
from IPython.display import display_html
import scipy.stats as stats

predictors ={'Q13':'value_approx','Q14':'is_electric','Q15':'bicycle_type','Q18':'is_recover',  'Q28':'seasons', 'Q29':'purpose', 'age_groups': 'age_groups', 'Q35': 'gender', 'Q36': 'income','Q37':'nm_bikes', 'Q38': 'education','country':'country'}
dependent_vars = {'Q25':'is_replaced','Q30':'mode_alt', 'Q31':'post_act'}
all_vars = predictors.copy()
all_vars.update(dependent_vars)

<span style="color: red;font-size: 50px">RUN THE NEXT CELL FOR THE FIRST TIME ONLY</span>

In [268]:

# Change col names and leave only relevant cols as well as delete unnecessary spaces and parenthesis (except age group)
cols_names = list(all_vars.values())
data_init = pd.read_csv(f'{project_folder}/new_data/new_data_2.csv')
temp = data_init['age_groups']
d_analysis = data_init.rename(columns=all_vars)[cols_names].astype(str).replace(r"\(.*?\)", "").astype(str).replace(":", "").apply(lambda row: [d.split('(')[0].strip() if '(' in d else d.strip() for d in row])
d_analysis['age_groups'] = temp


# This method translate the alternative data into how much the alternative is sustainable
def sus_alter(row):
    alt_stat = ['active','not active']
    if row in  ['Walk','Cycle, personal bicycle','Cycle, rental bicycle','Cycle, public bike share','Micro mobility']:
        return alt_stat[0]
    elif row in ['Car, as a driver','Car, as a passenger','Taxi / Ride-hailing service','Transit','Motorcycle or scooter']:
        return alt_stat[1]
    else:
        return row

d_analysis['mode_alt'] = d_analysis['mode_alt'].apply(sus_alter)

d_analysis.to_csv(f'{project_folder}/new_data/new_data_3.csv')


In [269]:
# read data
merge_q = pd.read_csv(f'{project_folder}/new_data/new_data_3.csv')

In [270]:
# organize and populate @more_data  dictionary
more_data ={item:[list(DataFrame(merge_q[item].unique()).dropna()[0]),False] for  item in all_vars.values()}

# Code relevant for reindex
# Bicycle attributes
q= 'value_approx'
df_t = DataFrame(merge_q[q].unique()).dropna().sort_values(by=0, ascending=0).reset_index(drop=True)
more_data[q][0] = pd.concat([df_t.iloc[0], df_t.iloc[6], df_t.iloc[4], df_t.iloc[8], df_t.iloc[7], df_t.iloc[5], df_t.iloc[1:4].sort_values(by=0, ascending=1)])[0].to_list()

# Demographic
more_data['education'][0]= ['Some high school or less',
                            'Graduated high school',
                            'Some university',
                            'Associate’s/vocational/technical\u202fdegree',
                            'Bachelor’s degree',
                            'Graduate degree',
                            'I prefer to not answer',
 ]
q = 'age_groups'
more_data[q][0] = DataFrame(merge_q[q].unique()).dropna()[::-1][0].to_list()


more_data['seasons'][0] = range(3)
q= 'income'
df_t = DataFrame(merge_q[q].unique()).dropna().sort_values(by=0, ascending=0).reset_index(drop=True)
more_data[q][0]= pd.concat([df_t.iloc[0], df_t.iloc[6:2:-1], df_t.iloc[8:6:-1], df_t.iloc[9], df_t.iloc[1:3]])[0].to_list()
q = 'nm_bikes'
df_t = DataFrame(merge_q[q].unique()).dropna().sort_values(by=0, ascending=0).reset_index(drop=True)
more_data[q][0]= pd.concat([df_t.iloc[0], df_t.iloc[3], df_t.iloc[1:3], df_t.iloc[5:], df_t.iloc[4]])[0].to_list()

# for the dependent variable post_act:
more_data['post_act'][0]= ['I stopped cycling',
                            'Less often',
                            'About the same / no change',
                            'More often',
]
# In case where only several cols are relevant
more_data['bicycle_type'][0]= more_data['bicycle_type'][1]=['Hybrid/City/Dutch','Mountain','Road','Gravel/cyclocross']


<span style="color: blue;font-size: 50px">Find relationships between independent variables </span>

In [95]:
dependents_list = ['value_approx','income','nm_bikes','age_groups','education']
data_to_exp = merge_q[dependents_list].fillna(-1)

def to_ordinal(col):
    # This function gets a column and return for each value its ordinal values as it stored in @more_data (for irrelevant data return -1)
    return data_to_exp[col.name].apply(lambda x:more_data[col.name][0].index(x) if x not in ['I prefer to not answer','Don’t know/not sure',-1] else -1)
data_as_ordinal = data_to_exp.apply(to_ordinal)


In [88]:
data_as_ordinal

Unnamed: 0,value_approx,income,nm_bikes,age_groups,education
0,3,1,1,7,1
1,4,1,3,7,2
2,4,5,2,7,5
3,2,-1,1,7,5
4,3,5,2,7,5
...,...,...,...,...,...
1816,4,-1,-1,7,-1
1817,1,-1,3,7,5
1818,2,0,2,7,5
1819,4,-1,-1,7,-1


In [89]:

data_list = []
# The code test every pair of independent variables to find the direction of the correlation
for items in [(x, y) for i,x in enumerate(dependents_list) for y in dependents_list[i+1:]]:
    var_1, var_2 = items
    df = data_as_ordinal[[var_1, var_2 ]]
    couple_to_test = df[~(df== -1).any(axis=1)] # remove missing or irrelevant data
    correlation, p_value = stats.spearmanr(couple_to_test[var_1],couple_to_test[var_2]) # use spearman to test the data
    data_list.append([var_1,var_2,correlation, p_value])
sta_spearman_table = DataFrame(data_list,columns=['var_1','var_2','correlation', 'p_value']).sort_values('correlation',ascending=False)
sta_spearman_table

Unnamed: 0,var_1,var_2,correlation,p_value
1,value_approx,nm_bikes,0.299765,1.1755379999999999e-36
6,income,education,0.28199,2.410065e-29
4,income,nm_bikes,0.241739,8.763972e-22
7,nm_bikes,age_groups,0.233628,1.3958950000000001e-22
0,value_approx,income,0.218758,4.6987e-18
5,income,age_groups,0.212985,3.320104e-17
2,value_approx,age_groups,0.191486,1.932273e-16
9,age_groups,education,0.145009,2.140645e-09
8,nm_bikes,education,0.114911,2.286376e-06
3,value_approx,education,0.041716,0.08701408


In [90]:
couple_to_test

Unnamed: 0,age_groups,education
0,7,1
1,7,2
2,7,5
3,7,5
4,7,5
...,...,...
1746,7,4
1751,7,5
1817,7,5
1818,7,5


<span style="color: blue;font-size: 50px">####</span>

In [271]:

class MyData:
    """
   The class organizes and explores the data, allowing for the creation of cross-tabulations.
    """
    def __init__(self, var_0, data, com_data):
        """

        :param var_0: The main variable that should not be replaced frequently throughout the program.
        :param data: Data frame
        :param com_data: provides more information when necessary to create more adaptable analysis
        """
        self.cols_name = [var_0,'']
        self.merge_q= data
        self.more_data = com_data
        self.reindex_rows = self.more_data[var_0][0]

    def explore_data(self,cross_tab= True):
        r"""
        Clean the data and print cross_tab if it is required
        :param cross_tab:
        :return:
        """
        # Based on these variables, the analysis can be employed.
        cross_cols_nm =self.cols_name
        cls_to_use = self.more_data[cross_cols_nm[1]][1]
        reindex_temp = self.more_data[cross_cols_nm[1]][0]
        only_rel_f= self.merge_q[cross_cols_nm].dropna()
        if cross_tab:
            if cls_to_use:
                only_rel_f = only_rel_f[only_rel_f[cross_cols_nm[1]].isin(cls_to_use)]
            # For ordinal categories, reindexing the columns is essential to align them in the required order.
            return only_rel_f[cross_cols_nm[1]].value_counts().reindex(reindex_temp), (pd.crosstab(only_rel_f[cross_cols_nm[0]], only_rel_f[cross_cols_nm[1]], normalize='columns') * 100).astype(int).reindex(columns=reindex_temp,index= self.reindex_rows)
        return only_rel_f

    def change_properties(self,name):
        """
        change the dependent variable and update desired reindex list
        :param name:
        :return:
        """
        self.cols_name[0] = name
        self.reindex_rows = self.more_data[name][0]
        sumy =my_data.merge_q[name].value_counts().sum()
        print((my_data.merge_q[name].value_counts()/sumy*100).apply('{:.0f}%'.format))




In [272]:

dependent_names = list(dependent_vars.values())+['value_approx','income','nm_bikes','age_groups','education']
predictors_names = list(predictors.values())
my_data = MyData(dependent_names[0],merge_q,more_data)
for dep in dependent_names:
    print(f'\n{dep}\n')
    writer = pd.ExcelWriter(f'output_2/data_with_insight/{dep}.xlsx')
    my_data.change_properties(dep)
    for pre in predictors_names+dependent_names:
        # relevant when we analysis the relationship between dependent_vars
        if pre==dep:
            continue
        my_data.cols_name[1] = pre
        value_counts,cross_tab = my_data.explore_data()
        value_counts.to_excel(writer, sheet_name=pre)
        cross_tab.to_excel(writer, sheet_name=pre, startrow=value_counts.shape[0] + 2)
    writer.save()



is_replaced

​Yes, and I replaced it with exactly what was stolen or something more expensive    46%
No                                                                                  31%
Yes, but the replacement was something cheaper                                      24%
Name: is_replaced, dtype: object

mode_alt

not active                 50%
active                     36%
Didn’t make those trips    13%
Don’t know/not sure         2%
Name: mode_alt, dtype: object

post_act

About the same / no change    49%
Less often                    30%
I stopped cycling             15%
More often                     6%
Name: post_act, dtype: object

value_approx

$500-$999                 29%
$1000-$1999               24%
$2000-3999                16%
$250-$499                 13%
$4000-6999                 8%
$7000 or more              5%
Less than $250             4%
Don’t know/not sure        0%
I prefer to not answer     0%
Name: value_approx, dtype: object

income

$200,000 or more pe

<span style="color: purple;font-size: 50px">Phase 4: Apply Chi-Square Test</span>


In [250]:
cols_to_delete = ['I prefer to not answer', 'Don’t know/not sure']
pre_test_data = pd.read_csv(f'{project_folder}/new_data/new_data_3.csv').fillna(-1)[list(all_vars.values())].applymap(lambda x: -1 if x in cols_to_delete + [-1] else x)
data_path = 'output_2/data_with_insight'
class MyTests:
    def __init__(self, col_1:str, col_2:str, rel_data:tuple):
        print(f'\n{col_1}::{col_2}')
        self.col_1 = col_1
        self.col_2 = col_2
        self.fields_to_test= rel_data[1]
        self.rel_data = rel_data

        if 'apply_chi' in rel_data[0]:
            print('apply_chi')
            self.__apply_chi()
        else:
            print('apply_spearmanr')
            self.__apply_spearmanr( rel_data[2])

    def __apply_chi(self):
        """
        apply chi_square test based on the cols and data
        :param new_names: more relevant data to employ when run the test
        :param col_1:
        :param col_2:
        :return:
        """
        # b.	Clean the data
        test_data = pre_test_data[[self.col_1, self.col_2]]
        # In case more data should be remove prior to the analysis
        if isinstance(self.rel_data[-1],list):
            test_data = test_data[~test_data.isin([-1]+self.rel_data[-1]).any(axis=1)]
        else:
            test_data = test_data[~test_data.isin([-1]).any(axis=1)]

        # c.	Create group
        for item in self.fields_to_test.items():
            internal_dict  = item[1]
            var_temp = item[0]
            test_data[var_temp] =  test_data[var_temp].apply(lambda x: internal_dict[x] if x in internal_dict else x)

        # d.	Cross tub
        contingency_table = pd.crosstab(test_data[self.col_1], test_data[self.col_2], normalize='columns') * 100

        # e.	test
        chi2, p, _, _ = stats.chi2_contingency(contingency_table)

        # Print the chi-square test statistic and p-value
        print(contingency_table)
        print('Chi-square test statistic:', chi2)
        print('p-value:', p)

    def __apply_spearmanr(self, more_cols_to_delete):

        # The data to read (the Excel name- @col_1 and sheet name  - @col_2)
        df = pd.read_excel(f'{data_path}/{self.col_1}.xlsx', sheet_name=self.col_2)
        # Obten only the contingency_table from the file, update the cols names and remove irrelevant data
        new_df = df.iloc[df.loc[df[df.columns[0]].isnull()].index.item()+1:]
        new_df.columns= new_df.iloc[0]
        cols_to_delete_temp = [col for col in cols_to_delete if col in new_df.columns] + more_cols_to_delete
        new_df = new_df.reset_index(drop=True).drop(0).drop(columns= cols_to_delete_temp).set_index(var_1)
        # extract the required data for test
        data_1= np.array(range(len(new_df.columns)))
        # the data to test which can be based on one or more fields (if it is list is more than one)
        if isinstance(self.fields_to_test,list):
            data_2 = np.array(new_df.loc[self.fields_to_test].sum())
        else:
            data_2= np.array(new_df.loc[self.fields_to_test])
        # Perform the Cochran-Armitage test of trend
        result = stats.spearmanr(data_1, data_2)

        # Extract the test statistic and p-value
        test_statistic = result.correlation
        p_value = result.pvalue

        # Print the test statistic and p-value
        print("Test Statistic:", test_statistic)
        print("p-value:", p_value)

In [None]:
predictors ={'Q13':'value_approx','Q14':'is_electric','Q15':'bicycle_type','Q18':'is_recover',  'Q28':'seasons', 'Q29':'purpose', 'age_groups': 'age_groups', 'Q35': 'gender', 'Q36': 'income','Q37':'nm_bikes', 'Q38': 'education','country':'country'}
dependent_vars = {'Q25':'is_replaced','Q30':'mode_alt', 'Q31':'post_act'}

In [229]:
pre_test_data['post_act'].value_counts()

About the same / no change    852
Less often                    511
I stopped cycling             263
More often                    104
-1                             91
Name: post_act, dtype: int64

In [266]:
# dependent 3
var_1 ='post_act'

# a.	Define var
var_2 = 'is_recover'
val_1 = 'positive'
val_2 = 'negative'

test_fields= ('apply_chi', {var_1:{'I stopped cycling':val_2, 'Less often':val_2,'About the same / no change':val_1,'More often':val_1}})
res= MyTests(var_1, var_2, test_fields)
var_2 = 'seasons'
test_fields=('apply_chi', {var_1:{'I stopped cycling':val_2, 'Less often':val_2,'About the same / no change':val_1,'More often':val_1}})
res = MyTests(var_1, var_2, test_fields)
var_2 = 'age_groups'
val_1 = 'still cycling'

test_fields=('apply_chi', {var_1:{ 'Less often':val_1,'About the same / no change':val_1,'More often':val_1},var_2:{age:val_2 for age in pre_test_data[var_2]if age not  in ['Young adults (18-24)']}},['Children (<13)','Adolescents (13-17)'])
res = MyTests(var_1, var_2, test_fields)
var_2 = 'income'
test_fields=('spearmanr', 'I stopped cycling',[])
res = MyTests(var_1, var_2, test_fields)
var_2 = 'value_approx'

val_1 = 'no more often'
val_2 = 'not cheap bike'
test_fields=('apply_chi', {var_1:{ 'Less often':val_1,'About the same / no change':val_1,'I stopped cycling':val_1},var_2:{temp:val_2 for temp in pre_test_data[var_2]if temp not  in ['Less than $250']}})
res = MyTests(var_1, var_2, test_fields)
var_2 = 'seasons'
val_1 = 'negative'
test_fields=('apply_chi', {var_1:{ 'Less often':val_1,'I stopped cycling':val_1}},['More often'])
res = MyTests(var_1, var_2, test_fields)
var_2 = 'value_approx'
test_fields=('spearmanr', 'About the same / no change',[])
res = MyTests(var_1, var_2, test_fields)

var_2 = 'nm_bikes'
test_fields=('spearmanr', 'About the same / no change',['Zero'])
res = MyTests(var_1, var_2, test_fields)

var_2 = 'education'
test_fields=('spearmanr', 'About the same / no change',[])
res = MyTests(var_1, var_2, test_fields)


post_act::is_recover
apply_chi
is_recover       No        Yes
post_act                      
negative    47.8738  27.675277
positive    52.1262  72.324723
Chi-square test statistic: 7.840402304773549
p-value: 0.005109123495550274

post_act::seasons
apply_chi
seasons         0.0        1.0        2.0
post_act                                 
negative  82.439024  45.986125  22.121896
positive  17.560976  54.013875  77.878104
Chi-square test statistic: 73.82059854544553
p-value: 9.333847489738014e-17

post_act::age_groups
apply_chi
age_groups         Young adults (18-24)   negative
post_act                                          
I stopped cycling             32.258065  14.374226
still cycling                 67.741935  85.625774
Chi-square test statistic: 7.971714255037737
p-value: 0.004751392107743264

post_act::income
apply_spearmanr
Test Statistic: -0.9221722216732094
p-value: 0.0011108173578061712

post_act::value_approx
apply_chi
value_approx   Less than $250  not cheap bike
post

In [237]:
# dependent 1
var_1 ='is_replaced'

# a.	Define var
var_2 = 'is_recover'
val = 'yes'
test_fields= ('apply_chi', {var_1:{'​Yes, and I replaced it with exactly what was stolen or something more expensive':val, 'Yes, but the replacement was something cheaper':val}})
res= MyTests(var_1, var_2, test_fields)

var_2 = 'seasons'
test_fields=('apply_chi', {var_1:{'​Yes, and I replaced it with exactly what was stolen or something more expensive':val, 'Yes, but the replacement was something cheaper':val}, var_2:{2:1}})
res = MyTests(var_1, var_2, test_fields)


var_2 = 'value_approx'
val_1= 'low'
val_2 = 'high'
test_fields=('apply_chi', {var_1:{'​Yes, and I replaced it with exactly what was stolen or something more expensive':val, 'Yes, but the replacement was something cheaper':'No'}, var_2:{'Less than $250':val_1,'$250-$499':val_1,'$500-$999':val_1,'$1000-$1999':val_1,'$2000-3999':val_1,'$4000-6999':val_2,'$7000 or more':val_2}})
res = MyTests(var_1, var_2, test_fields)

var_2 = 'income'
test_fields=('spearmanr', '​Yes, and I replaced it with exactly what was stolen or something more expensive',[])
res = MyTests(var_1, var_2, test_fields)
var_2 = 'nm_bikes'
test_fields=('spearmanr', '​Yes, and I replaced it with exactly what was stolen or something more expensive',['Zero'])
res = MyTests(var_1, var_2, test_fields)
var_2 = 'education'
test_fields=('spearmanr', '​Yes, and I replaced it with exactly what was stolen or something more expensive',[])
res = MyTests(var_1, var_2, test_fields)
var_2 = 'education'
test_fields=('spearmanr', '​Yes, and I replaced it with exactly what was stolen or something more expensive',[])
res = MyTests(var_1, var_2, test_fields)


is_replaced::is_recover
apply_chi
is_recover          No        Yes
is_replaced                      
No           25.993485  57.246377
yes          74.006515  42.753623
Chi-square test statistic: 18.833794309849555
p-value: 1.4261766558877277e-05

is_replaced::seasons
apply_chi
seasons            0.0        1.0
is_replaced                      
No           68.780488  25.223368
yes          31.219512  74.776632
Chi-square test statistic: 36.35287163814688
p-value: 1.646364575915698e-09

is_replaced::value_approx
apply_chi
value_approx       high        low
is_replaced                       
No            40.909091  56.365963
yes           59.090909  43.634037
Chi-square test statistic: 4.1831290626519895
p-value: 0.040828254116231075

is_replaced::income
apply_spearmanr
[0 1 2 3 4 5 6 7]
Test Statistic: 0.9700772721497397
p-value: 6.548558831120658e-05

is_replaced::nm_bikes
apply_spearmanr
[0 1 2 3 4]
Test Statistic: 0.8999999999999998
p-value: 0.03738607346849874

is_replaced::educ

In [238]:
# Test area

In [244]:
# dependent 3
var_1 ='post_act'

# a.	Define var
var_2 = 'is_recover'
val_1 = 'positive'
val_2 = 'negative'
col_1 = 'post_act'
col_2='seasons'
fields_to_test= {var_1:{'I stopped cycling':val_2, 'Less often':val_2,'About the same / no change':val_1,'More often':val_1}}
test_data = pre_test_data[[col_1, col_2]]
test_data = test_data[~test_data.isin([-1]).any(axis=1)]
test_data

Unnamed: 0,post_act,seasons
0,More often,1.0
1,About the same / no change,2.0
2,About the same / no change,1.0
3,About the same / no change,1.0
4,About the same / no change,2.0
...,...,...
1813,About the same / no change,0.0
1816,About the same / no change,1.0
1817,Less often,2.0
1818,Less often,1.0


In [245]:

# c.	Create group
for item in fields_to_test.items():
    internal_dict  = item[1]
    var_temp = item[0]
    test_data[var_temp] =  test_data[var_temp].apply(lambda x: internal_dict[x] if x in internal_dict else x)
test_data

Unnamed: 0,post_act,seasons
0,positive,1.0
1,positive,2.0
2,positive,1.0
3,positive,1.0
4,positive,2.0
...,...,...
1813,positive,0.0
1816,positive,1.0
1817,negative,2.0
1818,negative,1.0


In [243]:
internal_dict

{'I stopped cycling': 'high',
 'Less often': 'high',
 'About the same / no change': 'low',
 'More often': 'low'}

In [246]:

# d.	Cross tub
contingency_table = pd.crosstab(test_data[col_1], test_data[col_2], normalize='columns') * 100
contingency_table

seasons,0.0,1.0,2.0
post_act,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,82.439024,45.986125,22.121896
positive,17.560976,54.013875,77.878104


In [247]:

# e.	test
chi2, p, _, _ = stats.chi2_contingency(contingency_table)

# Print the chi-square test statistic and p-value
print(contingency_table)
print('Chi-square test statistic:', chi2)
print('p-value:', p)

seasons         0.0        1.0        2.0
post_act                                 
negative  82.439024  45.986125  22.121896
positive  17.560976  54.013875  77.878104
Chi-square test statistic: 73.82059854544553
p-value: 9.333847489738014e-17
