In [None]:
import numpy as np
import pandas as pd
from ggplot import *
import matplotlib  
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

In [None]:
# Load the trip records from the sample CSV.
# Alternative input file kept for reference (uncomment to use it instead):
#data = pd.read_csv('yellow_tripdata_2016-01.csv')
data = pd.read_csv('../data/sample.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Column dtypes as inferred by read_csv.
data.dtypes

In [None]:
# Number of missing values in each column.
data.isnull().sum(axis=0)

In [None]:
# Derive per-trip features from the raw timestamps and amounts.
# The timestamps are parsed once and every time-based feature is read from the
# parsed values instead of from fixed character positions in the raw strings.
pickup_ts = pd.to_datetime(data["tpep_pickup_datetime"])
dropoff_ts = pd.to_datetime(data["tpep_dropoff_datetime"])

# Columns are created in the same order as before so downstream column order
# (and therefore feature order) is unchanged.
data["pickup_day"] = pickup_ts.dt.day      # day of month
data["pickup_hour"] = pickup_ts.dt.hour    # hour of day, 0-23
data["pick_format"] = pickup_ts
data["dropoff_format"] = dropoff_ts

# Trip duration in hours. total_seconds() always returns float seconds;
# astype('timedelta64[s]') keeps a timedelta dtype on pandas >= 2.0, which
# makes the division by trip_time below fail.
data["trip_time"] = (dropoff_ts - pickup_ts).dt.total_seconds() / 3600

# Ratio features. Rows with a zero denominator yield inf/NaN here.
data['average_speed'] = data['trip_distance'] / data['trip_time']
data['cost_per_mile'] = data['fare_amount'] / data['trip_distance']
data['tip_percentage'] = 100 * data['tip_amount'] / data['fare_amount']

# Three-letter weekday label taken from the actual calendar date
# (dayofweek: 0 = Monday ... 6 = Sunday). The previous day-of-month modulo 7
# lookup was only correct for a month whose 1st falls on a Friday.
weekday_labels = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
data['week_day'] = pickup_ts.dt.dayofweek.map(weekday_labels)

In [None]:
# Human-readable labels for the integer codes stored in each coded column.
payment_type_labels = {
    1: 'Credit Card',
    2: 'Cash',
    3: 'No charge',
    4: 'Dispute',
}
vendor_labels = {
    1: 'Creative Mobile Technologies',
    2: 'VeriFone Inc.',
}
ratecode_labels = {
    1: 'Standard rate',
    2: 'JFK',
    3: 'Newark',
    4: 'Nassau or Westchester',
    5: 'Negotiated fare',
    6: 'Group ride',
    99: 'NA',
}

# Column name -> {code: label}. With a nested dict, DataFrame.replace applies
# each inner mapping only to the column named by its outer key.
replacements = {
    'payment_type': payment_type_labels,
    'VendorID': vendor_labels,
    'RatecodeID': ratecode_labels,
}

data.replace(to_replace=replacements, inplace=True)

In [None]:
# Keep only the rows that satisfy every range check below. The checks are
# combined into one boolean mask and applied in a single selection; the bounds
# are the same hand-picked limits used by the step-by-step version.
within_limits = (
    (data['trip_distance'] > 0) & (data['trip_distance'] <= 15.6)
    & (data['trip_time'] > 0) & (data['trip_time'] <= 0.9)
    & (data['total_amount'] >= 0) & (data['total_amount'] <= 70)
    & (data['fare_amount'] >= 0) & (data['fare_amount'] <= 52)
    & (data['tip_amount'] >= 0) & (data['tip_amount'] <= 11.67)
    & (data['pickup_longitude'] >= -75.0) & (data['pickup_longitude'] < -72)
    & (data['pickup_latitude'] >= 39.0) & (data['pickup_latitude'] <= 43.0)
    & (data['average_speed'] > 0) & (data['average_speed'] <= 100.0)
    & (data['cost_per_mile'] > 0) & (data['cost_per_mile'] <= 15.0)
)
data_clean = data.loc[within_limits]

In [None]:
# Summary statistics of the filtered data, including the 1st and 99th percentiles.
data_clean.describe(percentiles=[.01, .25, .5, .75, .99])

In [None]:
# Pairwise correlations between five numeric trip measures, drawn as an
# annotated square-cell heatmap.
corrmat = data_clean[['trip_distance','trip_time','total_amount','cost_per_mile', 'average_speed']].corr()
sns.heatmap(corrmat, annot=True, square = True)
plt.show()

### GEDA Univariado

In [None]:
# Number of trips per vendor.
ggplot(aes(x='VendorID'), data=data_clean) + geom_bar() + theme_bw()

In [None]:
# Number of trips per passenger count.
ggplot(aes(x='passenger_count'), data=data_clean) + geom_bar() + theme_bw()

In [None]:
# Number of trips per pickup hour.
ggplot(aes(x='pickup_hour'), data=data_clean) + geom_bar() + theme_bw()

In [None]:
# Number of trips per weekday label.
ggplot(aes(x='week_day'), data=data_clean) + geom_bar() + theme_bw()

In [None]:
# Number of trips per rate code.
ggplot(aes(x='RatecodeID'), data=data_clean) + geom_bar() + theme_bw()

In [None]:
# Number of trips per payment type.
ggplot(aes(x='payment_type'), data=data_clean) + geom_bar() + theme_bw()

In [None]:
# Histogram of trip distance, bins of width 1.
ggplot(aes(x='trip_distance'), data=data_clean) + geom_histogram(binwidth=1) + theme_bw()

In [None]:
# Histogram of trip time. trip_time is in hours and capped at 0.9 by the
# cleaning filter; the bin width is 1/54 of an hour (about 1.1 minutes).
ggplot(aes(x='trip_time'), data=data_clean) + geom_histogram(binwidth=1.0/(0.9*60.0)) + theme_bw()

In [None]:
# Histogram of total amount, bins of width 1.
ggplot(aes(x='total_amount'), data=data_clean) + geom_histogram(binwidth=1) + theme_bw()

In [None]:
# Histogram of fare amount, bins of width 1.
ggplot(aes(x='fare_amount'), data=data_clean) + geom_histogram(binwidth=1) + theme_bw()

In [None]:
# Histogram of tip amount, bins of width 0.5.
ggplot(aes(x='tip_amount'), data=data_clean) + geom_histogram(binwidth=0.5) + theme_bw()

In [None]:
# Histogram of cost per mile, bins of width 0.5.
ggplot(aes(x='cost_per_mile'), data=data_clean) + geom_histogram(binwidth=0.5) + theme_bw()

In [None]:
# Histogram of average speed, bins of width 1.
ggplot(aes(x='average_speed'), data=data_clean) + geom_histogram(binwidth=1) + theme_bw()

In [None]:
# Histogram of tip percentage, restricted to credit-card trips with a tip
# below 100% of the fare.
ggplot(aes(x='tip_percentage'), data=data_clean.loc[(data_clean['tip_percentage'] < 100) & (data_clean['payment_type'] == 'Credit Card')]) + geom_histogram(binwidth=1) + theme_bw()

### GEDA Bivariado

In [None]:
# Scatter: trip time against trip distance.
ggplot(aes(x ='trip_distance', y = 'trip_time'), data = data_clean) + geom_point() + theme_bw()

In [None]:
# Scatter: fare amount against trip time.
ggplot(aes(x ='trip_time', y = 'fare_amount'), data = data_clean) + geom_point() + theme_bw()

In [None]:
# Scatter: fare amount against trip distance.
ggplot(aes(x ='trip_distance', y = 'fare_amount'), data =data_clean) + geom_point() + theme_bw()

In [None]:
# Scatter: fare amount against average speed.
ggplot(aes(x ='average_speed', y = 'fare_amount'), data =data_clean) + geom_point() + theme_bw()

In [None]:
# Scatter: tip amount against fare amount, credit-card trips only.
ggplot(aes(x ='fare_amount', y = 'tip_amount'), data =data_clean.loc[data_clean['payment_type'] == 'Credit Card']) + geom_point() + theme_bw()

In [None]:
# Scatter: total amount against fare amount.
ggplot(aes(x ='fare_amount', y = 'total_amount'), data =data_clean) + geom_point() + theme_bw()

In [None]:
# Scatter: tip amount against trip distance.
ggplot(aes(x ='trip_distance', y = 'tip_amount'), data =data_clean) + geom_point() + theme_bw()

In [None]:
# Scatter: tip amount against average speed.
ggplot(aes(x ='average_speed', y = 'tip_amount'), data =data_clean) + geom_point() + theme_bw()

In [None]:
# Scatter: average speed against trip distance.
ggplot(aes(x ='trip_distance', y = 'average_speed'), data =data_clean) + geom_point() + theme_bw()

In [None]:
# Scatter: cost per mile against average speed.
ggplot(aes(x ='average_speed', y = 'cost_per_mile'), data =data_clean) + geom_point() + theme_bw()

In [None]:
# Scatter of pickup coordinates.
# NOTE(review): latitude is on the x axis and longitude on the y axis here,
# the reverse of the map-style scatters later in the notebook; confirm intent.
ggplot(aes(x ='pickup_latitude', y = 'pickup_longitude'), data =data_clean) + geom_point() + theme_bw()

In [None]:
# Boxplot: total amount by payment type.
ggplot(aes(x ='payment_type', y = 'total_amount'), data =data_clean) + geom_boxplot() + theme_bw()

In [None]:
# Boxplot: fare amount by payment type.
ggplot(aes(x ='payment_type', y = 'fare_amount'), data =data_clean) + geom_boxplot() + theme_bw()

In [None]:
# Boxplot: fare amount by vendor.
ggplot(aes(x ='VendorID', y = 'fare_amount'), data =data_clean) + geom_boxplot() + theme_bw()

In [None]:
# Boxplot: trip distance by vendor.
ggplot(aes(x ='VendorID', y = 'trip_distance'), data =data_clean) + geom_boxplot() + theme_bw()

In [None]:
# Boxplot: trip distance by passenger count.
ggplot(aes(x ='passenger_count', y = 'trip_distance'), data =data_clean) + geom_boxplot() + theme_bw()

In [None]:
# Boxplot: total amount by passenger count.
ggplot(aes(x ='passenger_count', y = 'total_amount'), data =data_clean) + geom_boxplot() + theme_bw()

In [None]:
# Boxplot: fare amount by passenger count.
ggplot(aes(x ='passenger_count', y = 'fare_amount'), data =data_clean) + geom_boxplot() + theme_bw()

In [None]:
# Boxplot: total amount by rate code.
ggplot(aes(x ='RatecodeID', y = 'total_amount'), data =data_clean) + geom_boxplot() + theme_bw()

In [None]:
# Boxplot: trip distance by rate code.
ggplot(aes(x ='RatecodeID', y = 'trip_distance'), data =data_clean) + geom_boxplot() + theme_bw()

### Otros análisis

In [None]:
# Mean average speed per pickup hour. The mean is passed as the bar weight,
# so each bar's height is that hour's mean rather than a row count.
means = data_clean[['pickup_hour','average_speed']].groupby(['pickup_hour']).mean().reset_index()
ggplot(aes(x ='pickup_hour', weight = 'average_speed'), data = means) + geom_bar() + theme_bw()

In [None]:
# Mean average speed per weekday.
means = data_clean[['average_speed','week_day']].groupby(['week_day']).mean().reset_index()
ggplot(aes(x ='week_day', weight = 'average_speed'), data = means) + geom_bar() + theme_bw()

In [None]:
# Mean average speed per pickup hour, one facet per weekday.
means = data_clean[['pickup_hour','week_day','average_speed']].groupby(['pickup_hour','week_day']).mean().reset_index()
ggplot(aes(x ='pickup_hour', weight = 'average_speed'), data = means) + geom_bar() + theme_bw() + facet_grid('week_day')

In [None]:
# Mean cost per mile per pickup hour.
means = data_clean[['pickup_hour','cost_per_mile']].groupby(['pickup_hour']).mean().reset_index()
ggplot(aes(x ='pickup_hour', weight = 'cost_per_mile'), data = means) + geom_bar() + theme_bw()

In [None]:
# Mean cost per mile per weekday.
means = data_clean[['cost_per_mile','week_day']].groupby(['week_day']).mean().reset_index()
ggplot(aes(x ='week_day', weight = 'cost_per_mile'), data = means) + geom_bar() + theme_bw()

In [None]:
# Mean cost per mile per pickup hour, one facet per weekday.
means = data_clean[['pickup_hour','week_day','cost_per_mile']].groupby(['pickup_hour','week_day']).mean().reset_index()
ggplot(aes(x ='pickup_hour', weight = 'cost_per_mile'), data = means) + geom_bar() + theme_bw() + facet_grid('week_day')

### Espacial

In [None]:
%matplotlib inline 

pd.options.display.mpl_style = 'default'
new_style = {'grid': False}
matplotlib.rc('axes', **new_style)
rcParams['figure.figsize'] = (5, 5)
rcParams['figure.dpi'] = 120

P=data_clean.plot(kind='scatter', x='pickup_longitude', y='pickup_latitude',color='white', xlim=(-74.06,-73.77),ylim=(40.61, 40.91),s=.02,alpha=.6)
P.set_axis_bgcolor('black')

In [None]:
%matplotlib inline 

pd.options.display.mpl_style = 'default'
new_style = {'grid': False}
matplotlib.rc('axes', **new_style)
rcParams['figure.figsize'] = (5, 5)
rcParams['figure.dpi'] = 120

P=data_clean.plot(kind='scatter', x='pickup_longitude', y='pickup_latitude',c='trip_distance', cmap='autumn', xlim=(-74.06,-73.77),ylim=(40.61, 40.91),s=.02,alpha=.6)
P.set_axis_bgcolor('black')

In [None]:
%matplotlib inline 

pd.options.display.mpl_style = 'default'
new_style = {'grid': False}
matplotlib.rc('axes', **new_style)
rcParams['figure.figsize'] = (5, 5)
rcParams['figure.dpi'] = 120

P=data_clean.loc[(data_clean['average_speed'] > 0) & (data_clean['average_speed'] <= 30.0)].plot(kind='scatter', x='pickup_longitude', y='pickup_latitude',c='average_speed', cmap='autumn', xlim=(-74.06,-73.77),ylim=(40.61, 40.91),s=.02,alpha=.6)
P.set_axis_bgcolor('black')

In [None]:
%matplotlib inline 

pd.options.display.mpl_style = 'default'
new_style = {'grid': False}
matplotlib.rc('axes', **new_style)
rcParams['figure.figsize'] = (5, 5)
rcParams['figure.dpi'] = 120

P=data_clean.plot(kind='scatter', x='pickup_longitude', y='pickup_latitude',c='cost_per_mile', cmap='autumn', xlim=(-74.06,-73.77),ylim=(40.61, 40.91),s=.02,alpha=.6)
P.set_axis_bgcolor('black')

### Predicción de tips

In [None]:
# Paquetes para pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import linear_model

# Paquetes para curva ROC y precision_recall
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score

# Paquete para AUC
from sklearn.metrics import roc_auc_score

In [None]:
# Modelling frame: trips paid by credit card only. .copy() makes data_tips an
# independent frame, so the column assignment below does not write into a
# slice of data_clean (which triggers SettingWithCopyWarning).
data_tips = data_clean.loc[data_clean['payment_type'] == 'Credit Card'].copy()

# Binary target: 1 when the tip is more than 22.08% of the fare, otherwise 0.
data_tips['high_tip'] = (data_tips['tip_percentage'] > 22.08).astype(int)

# Columns removed before modelling. The list includes the tip and fare amounts
# the target is computed from, plus raw timestamps and other unused columns.
columns = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'VendorID', 'payment_type', 'store_and_fwd_flag', 'fare_amount', 'tip_amount', 'tip_percentage', 'total_amount', 'cost_per_mile', 'mta_tax', 'improvement_surcharge', 'pick_format', 'dropoff_format', 'Unnamed: 0','dropoff_longitude','dropoff_latitude', 'trip_time','tolls_amount','pickup_day']
data_tips = data_tips.drop(columns, axis=1)
data_tips.head()

In [None]:
# Round pickup coordinates to two decimals so nearby pickups share a value.
data_tips = data_tips.round({'pickup_longitude': 2, 'pickup_latitude': 2})

# Mark the hour and the rounded coordinates as categorical so get_dummies
# one-hot encodes them instead of passing them through as numbers.
data_tips['pickup_hour'] = data_tips['pickup_hour'].astype('category')
data_tips['pickup_latitude'] = data_tips['pickup_latitude'].astype('category')
data_tips['pickup_longitude'] = data_tips['pickup_longitude'].astype('category')

# df1: one-hot columns for the categorical features.
# df2: every remaining column, in its original order. Selecting with a set
# (as before) is rejected by pandas >= 2.0 and gave an arbitrary column order,
# which made the feature order differ between runs.
columns = ['RatecodeID','week_day', 'pickup_hour','pickup_longitude', 'pickup_latitude']
df1 = pd.get_dummies(data_tips.loc[:, columns])
df2 = data_tips.drop(columns, axis=1)
data_tips = pd.concat([df1, df2], axis=1)

In [None]:
# Standardise trip_distance: subtract its mean and divide by its standard
# deviation, both computed over all rows of data_tips.
distance = data_tips['trip_distance']
data_tips['trip_distance'] = (distance - distance.mean()) / distance.std()

In [None]:
# Feature matrix (everything except the target) and its column names, which
# are in the same order as the columns of the X arrays below.
X_all = data_tips.drop('high_tip', axis=1)
y_all = data_tips['high_tip']
features = X_all.columns

# Time-based split on the pickup day of month: days <= 22 train, later days
# test. pickup_day is no longer a column of data_tips, so it is looked up in
# data_clean and aligned explicitly to data_tips' rows rather than relying on
# .loc to reindex a longer boolean mask.
pickup_day = data_clean.loc[data_tips.index, 'pickup_day']
is_train = pickup_day <= 22
is_test = pickup_day > 22

X_train = np.array(X_all.loc[is_train])
y_train = np.array(y_all.loc[is_train])
X_test = np.array(X_all.loc[is_test])
y_test = np.array(y_all.loc[is_test])
# Alternative random split (not used):
#X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=120652, test_size=0.3)

In [None]:
# Baseline random forest with fixed hyperparameters and a fixed seed.
rf = RandomForestClassifier(n_estimators = 10, min_samples_leaf=50, random_state=120652)
rf.fit(X_train, y_train)

In [None]:
# Accuracy of the baseline forest's class predictions on the test rows.
predicted = rf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)
print(accuracy)

In [None]:
# Hyperparameter grids for the grid search. Each key is prefixed with the
# pipeline step name ('RF__' / 'LR__') that the parameter belongs to.
params_rf = {
    'RF__n_estimators': [10, 20, 50],
    'RF__min_samples_leaf': [10, 30, 50],
}
params_lr = {'LR__C': [0.0001, 0.001, 0.01]}

In [None]:
def magic_loop(models_to_run, clfs, grids, X, y):
    """Grid-search every requested model and rank all evaluated settings.

    Each model is wrapped in a one-step ``Pipeline`` whose step name is the
    model's key, then tuned with a 5-fold ``GridSearchCV`` scored by F1.

    Parameters
    ----------
    models_to_run : list of str
        Keys into ``clfs``. Each key is also the pipeline step name, so the
        parameter names in the matching grid must start with ``'<key>__'``.
    clfs : dict
        Maps a model key to an unfitted estimator.
    grids : list of dict
        ``grids[i]`` is the parameter grid for ``models_to_run[i]``.
    X, y : array-like
        Training features and labels passed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated parameter setting across all models, with the
        columns ``parameters`` (dict), ``mean_validation_score`` (float) and
        ``cv_validation_scores`` (array of per-fold scores), sorted by
        ``mean_validation_score`` in descending order. When
        ``models_to_run`` is empty, an empty frame with those columns.

    Raises
    ------
    KeyError
        If a name in ``models_to_run`` is missing from ``clfs``.
    IndexError
        If ``grids`` has fewer entries than ``models_to_run``.
    """
    result_columns = ['parameters', 'mean_validation_score', 'cv_validation_scores']

    # Resolve every estimator up front so an unknown key fails before any
    # (potentially long) search has started.
    estimators = [clfs[name] for name in models_to_run]

    per_model = []
    for index, clf in enumerate(estimators):
        pipe = Pipeline([(models_to_run[index], clf)])
        grid = GridSearchCV(pipe, param_grid=grids[index], cv=5, verbose=1,
                            scoring='f1', n_jobs=-1)
        grid.fit(X, y)

        # cv_results_ replaces the removed grid_scores_ attribute. The
        # per-fold scores live under 'split<k>_test_score'; they are gathered
        # in fold order to rebuild one score array per parameter setting.
        cv = grid.cv_results_
        split_keys = sorted(
            (key for key in cv
             if key.startswith('split') and key.endswith('_test_score')),
            key=lambda key: int(key[len('split'):-len('_test_score')]),
        )
        fold_scores = np.column_stack([cv[key] for key in split_keys])

        per_model.append(pd.DataFrame(
            {
                'parameters': list(cv['params']),
                'mean_validation_score': cv['mean_test_score'],
                'cv_validation_scores': list(fold_scores),
            },
            columns=result_columns,
        ))

    # Nothing was searched: return an empty, correctly shaped frame instead
    # of failing on the sort below.
    if not per_model:
        return pd.DataFrame(columns=result_columns)

    # Concatenate once (DataFrame.append no longer exists in pandas >= 2.0)
    # and rank all settings, best first.
    results_df = pd.concat(per_model, ignore_index=True)
    return results_df.sort_values(by=['mean_validation_score'], ascending=False)

In [None]:
# Models to search, keyed by the step-name prefix used in their parameter grids.
models_to_run = ['LR','RF']
classifiers = {'LR':linear_model.LogisticRegression(),
               # Seeded with the same value as the baseline forest so the
               # search ranking is reproducible from run to run.
               'RF':RandomForestClassifier(random_state=120652)}

# Same order as models_to_run: magic_loop pairs grids[i] with models_to_run[i].
grids = [params_lr, params_rf]
m_loop = magic_loop(models_to_run, classifiers, grids, X_train, y_train)

In [None]:
# Grid-search results for all models, as returned by magic_loop.
m_loop

In [None]:
# Best random-forest setting in the ranked results. Column 0 holds the
# parameter dicts and the table mixes LR and RF rows, best first. The forest
# built next reads the 'RF__*' keys, so logistic-regression rows ranked above
# the best forest are skipped instead of causing a KeyError there.
best_params = next(
    params for params in m_loop.iloc[:, 0] if 'RF__n_estimators' in params
)
best_params

In [None]:
# Refit a random forest on the training set with the hyperparameters selected
# from the grid search. random_state uses the same seed as the baseline forest
# so the metrics and feature importances computed from this model are
# reproducible.
best_model = RandomForestClassifier(n_estimators=best_params['RF__n_estimators'],
                                    min_samples_leaf=best_params['RF__min_samples_leaf'],
                                    random_state=120652)

best_model.fit(X_train, y_train)

In [None]:
# Predicted probability of class 1 for every test row.
preds_test = best_model.predict_proba(X_test)[:,1]

# roc_curve returns (false positive rates, true positive rates, thresholds);
# only the first two are plotted.
roc = roc_curve(y_test, preds_test, pos_label=1)
false_positive_rate = roc[0]
true_positive_rate = roc[1]

plt.plot(false_positive_rate, true_positive_rate, label="ROC test")
plt.title('ROC curve')
# Diagonal reference line.
diagonal = [0, 1]
plt.plot(diagonal, diagonal, label="clasificador aleatorio")
plt.legend(loc="lower right")
plt.show()

In [None]:
# Accuracy of the tuned forest's class predictions on the test rows.
predicted = best_model.predict(X_test)
accuracy = accuracy_score(y_test, predicted)
print(accuracy)

In [None]:
# Area under the ROC curve, from the predicted class-1 probabilities.
roc_auc_score(y_test, preds_test)

In [None]:
# Pair each feature name with the importance the tuned forest assigned to it.
list(zip(features, best_model.feature_importances_))

In [None]:
# Logistic regression fitted on the training set with C=0.001, one of the
# values listed in params_lr.
# NOTE(review): C is hard-coded rather than read from the grid-search
# results; confirm it is the intended value.
best_lr = linear_model.LogisticRegression(C=0.001)

best_lr.fit(X_train, y_train)

In [None]:
# Table of logistic-regression coefficients, one row per feature.
# ravel() flattens without modifying its input. The previous in-place
# `.shape = (118, 1)` assignments only worked when there were exactly 118
# features and reshaped best_lr.coef_ itself, leaving the fitted model with a
# transposed coefficient array.
feature_names = np.asarray(features).ravel()
coefs = pd.DataFrame(
    {'feature': feature_names, 'coefficient': best_lr.coef_.ravel()},
    columns=['feature', 'coefficient'],
)
# Raise the display limit so the whole table is shown.
pd.options.display.max_rows = 999
coefs