In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import sort
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from IPython.display import clear_output

Using TensorFlow backend.


In [2]:
class myDict:
    def __init__(self,d):
        self.dic=d
    def normalize(self):
        a=np.array(list(self.dic.values()))
        return myDict(dict(zip(self.dic.keys(),(a-a.min())/a.max())))
    def __repr__(self):
        return str(self.dic)
    def __add__(self,other):
        return myDict(dict(pd.Series(self.dic)+pd.Series(other.dic)))
    def __truediv__(self,val):
        return myDict(dict(zip(self.dic.keys(),np.array(list(self.dic.values()))/val)))
    def keys(self):
        return self.dic.keys()
    def values(self):
        return self.dic.values()
    def get_sorted(self):
        a=list(zip(self.dic.keys(),self.dic.values()))
        return myDict(dict(sorted(a,key=lambda e:e[1])))
    
def get_feature_set_for(target_column):
    df=pd.read_csv("features_embedded.csv")
    df['norm_road']=df[['Signal', 'bus_stop', 'Turn','Congestion']].apply(lambda e: 1 if e[0]==e[1]==e[2]==e[3]==0 else 0,axis=1)

    labels=df[[target_column]].values
    features=df.drop(columns=['norm_road','Signal', 'bus_stop', 'Turn','Congestion'])

    oversample = SMOTE()
    features,labels = oversample.fit_resample(features,labels)
    return features,labels

In [3]:
def Feature_Importance(target_class,run_cases):
    feat_columns=\
    ['next_stop_distance', 'total_waiting_time', 'wifi_count', 'honks',
       'rsi', 'zone_highway', 'zone_market_place', 'zone_normal_city',
       'time_level_1', 'time_level_2', 'time_level_3', 'time_level_4',
       'Population_density_dense', 'Population_density_medium',
       'Population_density_sparse', 'Weekend/day_Week-day',
       'Weekend/day_Week-end']
    
    Sum=myDict(dict(zip(feat_columns,[0]*17))) #zero_initilization

    for _ in range(run_cases):
        X,y=get_feature_set_for(target_class)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=7)

        s=MinMaxScaler()
        X_train=s.fit_transform(X_train)
        X_test=s.transform(X_test)
        
        print('{}: on run {}'.format(target_class,_))
        model = XGBClassifier()
        model.fit(X_train, y_train)

        Sum+=myDict(dict(zip(X.columns,model.feature_importances_)))

    avg=Sum/run_cases
    avg=avg.get_sorted()

    fig=plt.figure(figsize=(10,5))
    fig.suptitle(target_class,size=16)
    ax=fig.add_subplot(111)
    ax.barh(list(avg.keys()),list(avg.values()),color=sns.color_palette('hot_r',17))
    ax.set_xlabel('Importance')
    plt.close()
    
    return fig,avg

In [9]:
fig_bus_stop,bus_stop_fi=Feature_Importance('bus_stop',100)
fig_norm_road,norm_road_fi=Feature_Importance('norm_road',100)
fig_Signal,Signal_fi=Feature_Importance('Signal',100)
fig_Turn,Turn_fi=Feature_Importance('Turn',100)
fig_Congestion,Congestion_fi=Feature_Importance('Congestion',100)
clear_output()

bus_stop: on run 0


  y = column_or_1d(y, warn=True)


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

# Feature Importance individual

In [6]:
fig_bus_stop

NameError: name 'fig_bus_stop' is not defined

In [None]:
fig_norm_road

In [None]:
fig_Signal

In [None]:
fig_Turn

In [None]:
fig_Congestion

# Average Feature Importance

In [8]:
avg_importance=\
((bus_stop_fi.normalize()+\
norm_road_fi.normalize()+\
Signal_fi.normalize()+\
Turn_fi.normalize()+\
Congestion_fi.normalize())/5).get_sorted()

NameError: name 'bus_stop_fi' is not defined

In [None]:
fig=plt.figure(figsize=(7,5))
fig.suptitle('AVG',size=16)
ax=fig.add_subplot(111)
ax.barh(list(avg_importance.keys()),list(avg_importance.values()),color=sns.color_palette('hot_r',17))
plt.show()

In [None]:
#DONE

In [7]:
def check_feature_accuracy(target_class,importance_dict=avg_importance):
    acc_mean_l=[]
    acc_std_l=[]

    for feat in range(1,17+1):
        acc_l=[]
        taken_feat=list(importance_dict.keys())[-feat:]
        for _ in range(10):

            X,y=get_feature_set_for(target_class)
            X_train, X_test, y_train, y_test = train_test_split(X[taken_feat], y, test_size=0.33, random_state=7)

            s=MinMaxScaler()
            X_train=s.fit_transform(X_train)
            X_test=s.transform(X_test)

            model = XGBClassifier()
            model.fit(X_train, y_train)

            acc=(model.predict(X_test)==y_test).mean()
            acc_l.append(acc)
        acc_mean_l.append(np.mean(acc_l))
        acc_std_l.append(np.std(acc_l))

    return acc_mean_l,acc_std_l

NameError: name 'avg_importance' is not defined

# For Average importance

In [5]:
bus_stopacc_mean_l,bus_stopacc_std_l=check_feature_accuracy('bus_stop')
norm_roadacc_mean_l,norm_roadacc_std_l=check_feature_accuracy('norm_road')
Signalacc_mean_l,Signalacc_std_l=check_feature_accuracy('Signal')
Turnacc_mean_l,Turnacc_std_l=check_feature_accuracy('Turn')
Congestionacc_mean_l,Congestionacc_std_l=check_feature_accuracy('Congestion')

NameError: name 'check_feature_accuracy' is not defined

In [None]:
fig=plt.figure(figsize=(10,7))
ax=fig.add_subplot(111)
ax.errorbar(list(map(str,range(1,17+1))),bus_stopacc_mean_l,yerr=bus_stopacc_std_l,label='bus_stop')
ax.errorbar(list(map(str,range(1,17+1))),norm_roadacc_mean_l,yerr=norm_roadacc_std_l,label='norm_road')
ax.errorbar(list(map(str,range(1,17+1))),Signalacc_mean_l,yerr=Signalacc_std_l,label='Signal')
ax.errorbar(list(map(str,range(1,17+1))),Turnacc_mean_l,yerr=Turnacc_std_l,label='Turn')
ax.errorbar(list(map(str,range(1,17+1))),Congestionacc_mean_l,yerr=Congestionacc_std_l,label='Congestion')
ax.set_xlabel('High-->Low importance')
ax.set_ylabel('Testing_Accuracy')
ax.legend(loc='upper left')
plt.show()

In [None]:
taken_list=list(avg_importance.keys())[-13:]

In [10]:
all_list=['next_stop_distance', 'total_waiting_time', 'wifi_count', 'honks',
'rsi', 'zone_highway', 'zone_market_place', 'zone_normal_city',
'time_level_1', 'time_level_2', 'time_level_3', 'time_level_4',
'Population_density_dense', 'Population_density_medium',
'Population_density_sparse', 'Weekend/day_Week-day',
'Weekend/day_Week-end']

In [None]:
def get_feature_number(taken_list):
    l=[]
    for n in taken_list:
        l.append(all_list.index(n))
    return l

In [None]:
get_feature_number(taken_list)

In [None]:
#NICE

# For only bus stop importance

In [None]:
bus_stopacc_mean_l_c,bus_stopacc_std_l_c=check_feature_accuracy('bus_stop',bus_stop_fi)

In [None]:
fig=plt.figure(figsize=(10,7))
ax=fig.add_subplot(111)
ax.errorbar(list(map(str,range(1,17+1))),bus_stopacc_mean_l_c,yerr=bus_stopacc_std_l_c,label='bus_stop')
ax.set_xlabel('High-->Low importance')
ax.set_ylabel('Testing_Accuracy')
ax.legend(loc='upper left')
plt.show()

In [None]:
l=get_feature_number(list(bus_stop_fi.keys())[-11:])

In [None]:
final=[]
for i in l:
    final.append(all_list[i])
print(final)

# For norm_road only

In [None]:
norm_roadacc_mean_l_c,norm_roadacc_std_l_c=check_feature_accuracy('norm_road',norm_road_fi)

In [None]:
fig=plt.figure(figsize=(10,7))
ax=fig.add_subplot(111)
ax.errorbar(list(map(str,range(1,17+1))),norm_roadacc_mean_l_c,yerr=norm_roadacc_std_l_c,label='norm_road')
ax.set_xlabel('High-->Low importance')
ax.set_ylabel('Testing_Accuracy')
ax.legend(loc='upper left')
plt.show()

In [None]:
l=get_feature_number(list(norm_road_fi.keys())[-14:])

In [None]:
final=[]
for i in l:
    final.append(all_list[i])
print(final)

# For Signal only

In [None]:
Signalacc_mean_l_c,Signalacc_std_l_c=check_feature_accuracy('Signal',Signal_fi)

In [None]:
fig=plt.figure(figsize=(10,7))
ax=fig.add_subplot(111)
ax.errorbar(list(map(str,range(1,17+1))),Signalacc_mean_l_c,yerr=Signalacc_std_l_c,label='Signal')
ax.set_xlabel('High-->Low importance')
ax.set_ylabel('Testing_Accuracy')
ax.legend(loc='upper left')
plt.show()

In [None]:
l=get_feature_number(list(Signal_fi.keys())[-17:])

In [15]:
final=[]
for i in [16, 13, 9, 1, 10, 5, 0, 6, 14, 15, 7]:
    final.append(all_list[i])
print(final)

['Weekend/day_Week-end', 'Population_density_medium', 'time_level_2', 'total_waiting_time', 'time_level_3', 'zone_highway', 'next_stop_distance', 'zone_market_place', 'Population_density_sparse', 'Weekend/day_Week-day', 'zone_normal_city']


# For Turn

In [None]:
Turnacc_mean_l_c,Turnacc_std_l_c=check_feature_accuracy('Turn',Turn_fi)

In [None]:
fig=plt.figure(figsize=(10,7))
ax=fig.add_subplot(111)
ax.errorbar(list(map(str,range(1,17+1))),Turnacc_mean_l_c,yerr=Turnacc_std_l_c,label='Turn')
ax.set_xlabel('High-->Low importance')
ax.set_ylabel('Testing_Accuracy')
ax.legend(loc='upper left')
plt.show()

In [None]:
get_feature_number(list(Turn_fi.keys())[-15:])

# For Congestion

In [None]:
Congestionacc_mean_l_c,Congestionacc_std_l_c=check_feature_accuracy('Congestion',Congestion_fi)

In [None]:
fig=plt.figure(figsize=(10,7))
ax=fig.add_subplot(111)
ax.errorbar(list(map(str,range(1,17+1))),Congestionacc_mean_l_c,yerr=Congestionacc_std_l_c,label='Congestion')
ax.set_xlabel('High-->Low importance')
ax.set_ylabel('Testing_Accuracy')
ax.legend(loc='upper left')
plt.show()

In [None]:
l=get_feature_number(list(Congestion_fi.keys())[-14:])

In [11]:
final=[]
for i in [0, 2, 11, 8, 10, 9, 1, 12, 13, 16, 6, 15, 5, 7]:
    final.append(all_list[i])
print(final)

['next_stop_distance', 'wifi_count', 'time_level_4', 'time_level_1', 'time_level_3', 'time_level_2', 'total_waiting_time', 'Population_density_dense', 'Population_density_medium', 'Weekend/day_Week-end', 'zone_market_place', 'Weekend/day_Week-day', 'zone_highway', 'zone_normal_city']
