In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df_initial = pd.read_csv('/home/david/Desktop/uber/train_revised.csv',low_memory=False)


In [3]:
df_initial.head()

Unnamed: 0,ride_id,seat_number,payment_method,payment_receipt,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,1442,15A,Mpesa,UZUEHCBUSO,17-10-17,7:15,Migori,Nairobi,Bus,49
1,5437,14A,Mpesa,TIHLBUSGTE,19-11-17,7:12,Migori,Nairobi,Bus,49
2,5710,8B,Mpesa,EQX8Q5G19O,26-11-17,7:05,Keroka,Nairobi,Bus,49
3,5777,19A,Mpesa,SGP18CL0ME,27-11-17,7:10,Homa Bay,Nairobi,Bus,49
4,5778,11A,Mpesa,BM97HFRGL9,27-11-17,7:12,Migori,Nairobi,Bus,49


# Create target variable

In [4]:
df_new = df_initial.drop(['seat_number', 'payment_method', 'payment_receipt', 'travel_to'], axis=1)


In [5]:
ride_id_dict = {} 
for ride_id in df_initial["ride_id"]:
    if not ride_id in ride_id_dict:
        ride_id_dict[ride_id] = 1
    else:
        ride_id_dict[ride_id] += 1  


## drop duplicate


In [6]:
df_new.drop_duplicates(inplace=True)
df_new.reset_index(drop= True, inplace=True)

In [7]:
df_new["number_of_tickets"]= np.zeros(len(df_new))

In [8]:
for i in range(len(df_new)):
    ride_id = df_new.loc[i]["ride_id"]
    df_new.at[i,"number_of_tickets"] = ride_id_dict[ride_id]

In [9]:
df_new.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,1442,17-10-17,7:15,Migori,Bus,49,1.0
1,5437,19-11-17,7:12,Migori,Bus,49,1.0
2,5710,26-11-17,7:05,Keroka,Bus,49,1.0
3,5777,27-11-17,7:10,Homa Bay,Bus,49,5.0
4,5778,27-11-17,7:12,Migori,Bus,49,31.0


In [10]:
df_new.to_csv('train_aggregated.csv', index=False)

In [11]:
df_train_set=pd.read_csv('train_aggregated.csv', low_memory=False)

In [12]:
df_train_set.drop(['ride_id'], axis=1, inplace=True)

## Pre process travel date

In [13]:
df_train_set["travel_date"]=pd.to_datetime(df_train_set["travel_date"],infer_datetime_format=True)
df_train_set["day_of_week"]= df_train_set["travel_date"].dt.dayofweek
df_train_set["year"]= df_train_set["travel_date"].dt.year
df_train_set["month"]= df_train_set["travel_date"].dt.month

In [14]:
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets,day_of_week,year,month
0,2017-10-17,7:15,Migori,Bus,49,1.0,1,2017,10
1,2017-11-19,7:12,Migori,Bus,49,1.0,6,2017,11
2,2017-11-26,7:05,Keroka,Bus,49,1.0,6,2017,11
3,2017-11-27,7:10,Homa Bay,Bus,49,5.0,0,2017,11
4,2017-11-27,7:12,Migori,Bus,49,31.0,0,2017,11


In [15]:
xternaldata=[df_train_set]

d={'Migori':370.9,'Keroka':279.8,'Kisii':305.5,'Homa Bay':305.5,'Keumbu':294.0,
        'Rongo':330.3,'Kijauri':276.6,'Oyugis':331.1,'Awendo':349.5,
        'Sirare':391.9,'Nyachenge':322.8,'Kehancha':377.5,
        'Kendu Bay':367.5,'Sori':392,'Rodi':349.1,'Mbita':399.4,
        'Ndhiwa':369.6}

for dataset in xternaldata:
    dataset['avg_distance']= dataset['travel_from'].map(d)

## pre process car type

In [16]:
df_train_set["car_type"]=pd.Categorical(df_train_set["car_type"])
car_type_categorical= df_train_set.car_type.cat.categories
df_train_set["car_type"]= df_train_set.car_type.cat.codes

In [17]:
df_train_set["car_type"]=df_train_set["car_type"].replace(to_replace=0, value= 49)
df_train_set["car_type"]=df_train_set["car_type"].replace(to_replace=1, value= 11)

In [18]:
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets,day_of_week,year,month,avg_distance
0,2017-10-17,7:15,Migori,49,49,1.0,1,2017,10,370.9
1,2017-11-19,7:12,Migori,49,49,1.0,6,2017,11,370.9
2,2017-11-26,7:05,Keroka,49,49,1.0,6,2017,11,279.8
3,2017-11-27,7:10,Homa Bay,49,49,5.0,0,2017,11,305.5
4,2017-11-27,7:12,Migori,49,49,31.0,0,2017,11,370.9


## pre process travel From

In [19]:
df_train_set["travel_from"]=pd.Categorical(df_train_set["travel_from"])
travel_from_categorical= df_train_set.travel_from.cat.categories
df_train_set["travel_from"]= df_train_set.travel_from.cat.codes

In [20]:
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets,day_of_week,year,month,avg_distance
0,2017-10-17,7:15,9,49,49,1.0,1,2017,10,370.9
1,2017-11-19,7:12,9,49,49,1.0,6,2017,11,370.9
2,2017-11-26,7:05,4,49,49,1.0,6,2017,11,279.8
3,2017-11-27,7:10,1,49,49,5.0,0,2017,11,305.5
4,2017-11-27,7:12,9,49,49,31.0,0,2017,11,370.9


## pre process travel time 

In [21]:
df_train_set["travel_time"]=df_train_set["travel_time"].str.split(':').apply(lambda x: int (x[0])* 60 + int(x[1]))

In [22]:
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets,day_of_week,year,month,avg_distance
0,2017-10-17,435,9,49,49,1.0,1,2017,10,370.9
1,2017-11-19,432,9,49,49,1.0,6,2017,11,370.9
2,2017-11-26,425,4,49,49,1.0,6,2017,11,279.8
3,2017-11-27,430,1,49,49,5.0,0,2017,11,305.5
4,2017-11-27,432,9,49,49,31.0,0,2017,11,370.9


In [23]:
df_train_set.drop('travel_date', axis=1,inplace=True) 

In [24]:
df_train_set.head()

Unnamed: 0,travel_time,travel_from,car_type,max_capacity,number_of_tickets,day_of_week,year,month,avg_distance
0,435,9,49,49,1.0,1,2017,10,370.9
1,432,9,49,49,1.0,6,2017,11,370.9
2,425,4,49,49,1.0,6,2017,11,279.8
3,430,1,49,49,5.0,0,2017,11,305.5
4,432,9,49,49,31.0,0,2017,11,370.9


# Train dataset

In [25]:
X= df_train_set.drop(["number_of_tickets"], axis=1)
y=df_train_set.number_of_tickets

In [26]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error,mean_absolute_error


In [27]:
tree= RandomForestRegressor(max_depth=3, random_state=42)

In [28]:
tree.fit(X,y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

## Random Forest

In [29]:
model=RandomForestRegressor(bootstrap=100,criterion='mae', n_jobs=-1,random_state=10,n_estimators=10, 
                            max_features=0.8,verbose=1,min_samples_leaf=8,max_depth=10,min_samples_split=45,
                            oob_score=True)

In [30]:
model.fit(X,y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.8s finished
  warn("Some inputs do not have OOB scores. "


RandomForestRegressor(bootstrap=100, criterion='mae', max_depth=10,
           max_features=0.8, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=8, min_samples_split=45,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=True, random_state=10, verbose=1, warm_start=False)

In [31]:
predict_train_set= model.predict(X)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


In [32]:
print(mean_absolute_error(predict_train_set,y))

3.3159945591294604


# Test pre process

In [33]:
test= pd.read_csv('/home/david/Desktop/uber/test_questions.csv', low_memory=False)

In [34]:
test.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,247,2018-05-07,07:06,Kisii,Nairobi,Bus,49
1,256,2018-05-06,11:08,Kisii,Nairobi,shuttle,11
2,275,2018-05-04,05:00,Kisii,Nairobi,shuttle,11
3,285,2018-05-04,09:10,Kisii,Nairobi,shuttle,11
4,286,2018-05-04,09:20,Kisii,Nairobi,shuttle,11


In [35]:
ride_id_dict = {} 
for ride_id in test["ride_id"]:
    if not ride_id in ride_id_dict:
        ride_id_dict[ride_id] = 1
    else:
        ride_id_dict[ride_id] += 1  

## drop duplicate


In [36]:
test.drop_duplicates(inplace=True)
test.reset_index(drop= True, inplace=True)

In [37]:
test.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,247,2018-05-07,07:06,Kisii,Nairobi,Bus,49
1,256,2018-05-06,11:08,Kisii,Nairobi,shuttle,11
2,275,2018-05-04,05:00,Kisii,Nairobi,shuttle,11
3,285,2018-05-04,09:10,Kisii,Nairobi,shuttle,11
4,286,2018-05-04,09:20,Kisii,Nairobi,shuttle,11


## Pre process travel date

In [38]:
test["travel_date"]=pd.to_datetime(test["travel_date"],infer_datetime_format=True)
test["day_of_week"]= test["travel_date"].dt.dayofweek
test["year"]= test["travel_date"].dt.year
test["month"]= test["travel_date"].dt.month

In [39]:
test.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity,day_of_week,year,month
0,247,2018-05-07,07:06,Kisii,Nairobi,Bus,49,0,2018,5
1,256,2018-05-06,11:08,Kisii,Nairobi,shuttle,11,6,2018,5
2,275,2018-05-04,05:00,Kisii,Nairobi,shuttle,11,4,2018,5
3,285,2018-05-04,09:10,Kisii,Nairobi,shuttle,11,4,2018,5
4,286,2018-05-04,09:20,Kisii,Nairobi,shuttle,11,4,2018,5


In [40]:
xternaldata=[test]

d={'Migori':370.9,'Keroka':279.8,'Kisii':305.5,'Homa Bay':305.5,'Keumbu':294.0,
        'Rongo':330.3,'Kijauri':276.6,'Oyugis':331.1,'Awendo':349.5,
        'Sirare':391.9,'Nyachenge':322.8,'Kehancha':377.5,
        'Kendu Bay':367.5,'Sori':392,'Rodi':349.1,'Mbita':399.4,
        'Ndhiwa':369.6}

for dataset in xternaldata:
    dataset['avg_distance']= dataset['travel_from'].map(d)

In [41]:
test.drop(['travel_to'], axis=1,inplace=True)

## pre process car type

In [42]:
test["car_type"]=pd.Categorical(test["car_type"])
car_type_categorical= test.car_type.cat.categories
test["car_type"]= test.car_type.cat.codes

In [43]:
test["car_type"]=test["car_type"].replace(to_replace=0, value= 49)
test["car_type"]=test["car_type"].replace(to_replace=1, value= 11)

## Pre process travel from

In [44]:
test["travel_from"]=pd.Categorical(test["travel_from"])
travel_from_categorical= test.travel_from.cat.categories
test["travel_from"]= test.travel_from.cat.codes

## pre process travel time

In [45]:
test["travel_time"]=test["travel_time"].str.split(':').apply(lambda x: int (x[0])* 60 + int(x[1]))

In [46]:
test.drop('travel_date',axis=1,inplace=True)

In [47]:
test.head()

Unnamed: 0,ride_id,travel_time,travel_from,car_type,max_capacity,day_of_week,year,month,avg_distance
0,247,426,5,49,49,0,2018,5,305.5
1,256,668,5,11,11,6,2018,5,305.5
2,275,300,5,11,11,4,2018,5,305.5
3,285,550,5,11,11,4,2018,5,305.5
4,286,560,5,11,11,4,2018,5,305.5


# predict on test

In [48]:
X_test=test.drop(['ride_id'], axis=1)

In [49]:
test_predictions= model.predict(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished


In [50]:
d={'ride_id':test["ride_id"],'number_of_ticket':test_predictions.round()}

In [51]:
pred=pd.DataFrame(data=d)

In [52]:
pred=pred[['ride_id','number_of_ticket']]

In [53]:
pred.to_csv('david.csv', index=False )

In [54]:
pred.head()

Unnamed: 0,ride_id,number_of_ticket
0,247,4.0
1,256,8.0
2,275,1.0
3,285,9.0
4,286,9.0


# Voting 

In [55]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import sklearn.linear_model
import sklearn.ensemble
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [56]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=43)

In [57]:
deci_model=DecisionTreeClassifier()


In [59]:
deci_model.fit(X_train,y_train)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [60]:
estimator.append(('cart',deci_model))

NameError: name 'estimator' is not defined