In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## Import Dataset

In [None]:
# Load the trip records from divvytrips.csv (path relative to the working
# directory) and preview the first rows.
dataset = pd.read_csv("divvytrips.csv")
dataset.head()

## Detect and Delete Missing Values

In [None]:
# Total number of missing cells in the whole frame. The vectorized reduction
# gives the same count as the builtin sum() without iterating over every
# cell in Python.
dataset.isnull().values.sum()

In [None]:
# Per-column non-null counts and dtypes; a count below the total row count
# means the column has missing values.
dataset.info()
# gender and birthyear have missing values

In [None]:
# Drop every row that has a missing value in any column.
dataset = dataset.dropna()

In [None]:
# Re-check the non-null counts now that incomplete rows have been dropped.
dataset.info()

## Reduce Dataset to 35000 rows 

In [None]:
# Keep only the first 35,000 rows by position.
# A positional slice does not depend on the total row count. The previous
# approach dropped a hard-coded positional range (35000..1234638), which
# raises IndexError when the frame is shorter than that and leaves extra rows
# behind when it is longer.
# .copy() yields an independent frame so that later column assignments on
# `dataset` do not trigger SettingWithCopyWarning.
dataset = dataset.iloc[:35000].copy()
dataset.info()

## Graph 1: Birth Year Frequency Distribution

In [None]:
# Histogram with one bar per birth year.
# All bins are half-open except the last one, which is closed on both sides.
# The edges therefore have to run one step past the maximum year; stopping at
# the maximum would merge the two most recent birth years into a single bar.
birth_years = dataset['birthyear']
year_edges = np.arange(birth_years.min(), birth_years.max() + 2)

plt.subplots(figsize=(20,8))
plt.hist(birth_years, bins=year_edges)
plt.title('Birth Year Frequency Distribution',size=15)
plt.xlabel('Birth Years', size=15)
plt.ylabel('Frequency', size=15)
plt.show()

## Graph 2: Male to Female Ratio Pie Chart

In [None]:
# Number of trips recorded for each gender value.
sums = dataset.groupby('gender')['gender'].count()

In [None]:
# Display the per-gender trip counts.
sums

In [None]:
# Pie chart of each gender's share of trips; the equal aspect ratio keeps the
# pie circular.
fig, ax = plt.subplots()
ax.axis('equal')
ax.pie(sums, labels=sums.index, autopct='%1.1f%%')
ax.set_title('Male to Female Ratio', size=15)
plt.show()

## Graph 3: Genderwise Average Trip Duration

In [None]:
# Bar chart of the mean trip duration for each gender.
# `kind` is passed by keyword because current pandas raises TypeError when
# Series.plot() receives positional arguments; GroupBy.mean() replaces
# aggregate(np.mean) and computes the same value.
avgdist = dataset['tripduration'].groupby(dataset.gender).mean().plot(kind='bar')
plt.title('Genderwise Average Trip Duration')
plt.xlabel('Gender', size=15)
plt.ylabel('Average Trip Duration',size=15)
plt.xticks(size=10)
plt.show()

## Graph 4: Age vs Average Travel Duration

In [None]:
# Rider age derived from birth year, using 2017 as the reference year.
# NOTE(review): presumably 2017 is the year the trips were recorded — confirm
# against the data before reusing this with another extract.
dataset['Age'] = 2017 - dataset['birthyear']

In [None]:
# Bar chart of the mean trip duration for each rider age.
# The Axes is created explicitly and handed to pandas so the plot lands on the
# wide figure. `kind` is passed by keyword because current pandas raises
# TypeError when Series.plot() receives positional arguments.
fig, ax = plt.subplots(figsize=(25,8))
agedist = dataset['tripduration'].groupby(dataset.Age).mean().plot(kind='bar', ax=ax)
plt.title('Age vs Average Travel Duration',size=15)
plt.xlabel('Age', size=15)
plt.ylabel('Average Travel Duration',size=15)
plt.show()

## Graph 5: Scatterplot for Tripduration

In [None]:
# Scatter of trip duration against trip id; the x ticks are hidden because the
# ids only serve to spread the points out.
fig, ax = plt.subplots(figsize=(20,8))
ax.scatter(dataset['trip_id'], dataset['tripduration'])
ax.set_xticks([])
ax.set_ylabel('Trip Duration', size=15)
ax.set_title('Trip Duration Scatter Plot', size=15)
plt.show()

## Feature Engineering: Start Month, Day and Hour

In [None]:
from dateutil.parser import parse

In [None]:
# Month (m), hour (h) and day of month (d) of every trip's start time.
# Each timestamp string is parsed once and the result reused for all three
# fields, instead of parsing the same string three separate times.
parsed_starts = [parse(i) for i in dataset.start_time]

m = [started.month for started in parsed_starts]
h = [started.hour for started in parsed_starts]
d = [started.day for started in parsed_starts]

In [None]:
# Attach the extracted start-time parts as new columns
# (m = month, d = day of month, h = hour).
dataset['start_month'] = m
dataset['start_day'] = d
dataset['start_hour'] = h

In [None]:
# Preview the frame with the new start_month / start_day / start_hour columns.
dataset.head()

### Non-numerical to Numerical Data

In [None]:
#Function to convert Non-numerical to Numerical Data

def handle_non_numerical_data(df):
    """Replace every non-numeric column of ``df`` with integer codes.

    Within each non-numeric column, every distinct value is assigned an
    integer (0, 1, 2, ...) in order of first appearance, so the encoding is
    identical on every run. Numeric columns of any width (int32, int64,
    float32, ...) are left untouched; boolean columns are still encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with non-numeric columns replaced by codes.
    """
    for column in df.columns.values:
        is_bool = pd.api.types.is_bool_dtype(df[column])
        if pd.api.types.is_numeric_dtype(df[column]) and not is_bool:
            continue

        column_contents = df[column].tolist()
        # dict.fromkeys de-duplicates while keeping first-seen order. Iterating
        # a set instead would hand string values different codes from run to
        # run, because string hashing is randomized per process.
        text_digit_vals = {
            unique: code
            for code, unique in enumerate(dict.fromkeys(column_contents))
        }
        # The mapping is built from and applied to the same list, so every
        # lookup is guaranteed to find its key.
        df[column] = [text_digit_vals[val] for val in column_contents]

    return df

# Encode a copy: handle_non_numerical_data modifies the frame it receives in
# place, and a plain `df = dataset` would make both names point at the same
# object, silently replacing the readable values in `dataset` as well.
df = dataset.copy()
df = handle_non_numerical_data(df)

In [None]:
# Preview the frame after the non-numeric columns were replaced by integer codes.
df.head()

In [None]:
# Feature matrix: everything except the target and the listed identifier,
# timestamp, station-name and birth-year columns.
x = df.drop(
    columns=[
        'tripduration',
        'end_time',
        'birthyear',
        'trip_id',
        'from_station_name',
        'to_station_name',
        'start_time',
        'bikeid',
    ]
)
# Target: the trip duration column.
y = df['tripduration']

In [None]:
# Preview the feature matrix.
x.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for testing; the fixed random_state makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.40, random_state=123)

## Write CSV File

In [None]:
# Save the encoded frame to finaldata.csv in the working directory.
# to_csv is called with its defaults, so the row index is written as well.
df.to_csv('finaldata.csv')

## Random Forest Regressor

In [None]:
# Baseline random forest. The fixed seed (the same value used for the
# train/test split) makes the fitted trees, the reported errors and the
# feature importances repeatable between runs.
rm_model = RandomForestRegressor(random_state=123)

In [None]:
# Fit the baseline forest on the training split and predict the held-out rows.
rm_model.fit(x_train,y_train)
y_pred = rm_model.predict(x_test)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Test-set errors of the baseline forest.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("MSE: ", mse)
print("\nMAE: ", mae)

## Feature Importance and Selection

In [None]:
# Feature importances of the fitted forest, rounded to three decimals and
# ordered from most to least important.
importances = (
    pd.DataFrame({
        'feature': x.columns,
        'importance': np.round(rm_model.feature_importances_, 3),
    })
    .sort_values('importance', ascending=False)
    .set_index('feature')
)

# Printed as percentages; the bar chart uses the raw fractions.
print(importances*100,"%")
importances.plot.bar()

In [None]:
# Drop the usertype and start_month features, then re-split with a 30% test set.
# NOTE(review): the earlier split held out 40%, so errors reported before and
# after this cell are measured on different test sets — confirm this is intended.
# NOTE(review): running this cell a second time fails, because the two columns
# are already gone from x.
x = x.drop(['usertype','start_month'],axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=123)

## OOB Score vs. Number of Estimators

In [None]:
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.metrics import mean_squared_error,mean_absolute_error

# Fit one forest per ensemble size and record its out-of-bag score.
n_estimators = [10,20,50,80,100,200]
# NOTE: despite the name, this list holds oob_score_ values (R^2, higher is
# better), not errors. The name is kept because the plotting cell reads it.
oob_errors = []
for n in n_estimators:
    # criterion, max_features, min_samples_split, min_samples_leaf and
    # bootstrap are left at their defaults. Those are the values that were
    # previously spelled out, and the old spellings criterion="mse" and
    # max_features="auto" are rejected by current scikit-learn releases.
    ranfor = RFR(n_estimators=n, oob_score=True, random_state=123, verbose=1)
    ranfor.fit(x_train,y_train)
    oob_errors.append(ranfor.oob_score_)

# Only the last (largest) forest is evaluated on the test split, so it is
# predicted once here rather than on every iteration.
y_pred = ranfor.predict(x_test)

print("OOB Score: ", oob_errors)
print("\n MAE: ", mean_absolute_error(y_test,y_pred))

In [None]:
# OOB score as a function of the number of trees.
fig, ax = plt.subplots()
ax.plot(n_estimators, oob_errors)
ax.set_title("OOB Score vs Number of Estimators")
ax.set_xlabel("Number of estimators")
ax.set_ylabel("OOB Score")
plt.show()

## GridSearchCV for RandomForestRegressor

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate hyper-parameter values; every combination is tried.
param_grid = {
    'n_estimators': [15, 35, 50],
    'max_depth': [6, 8, 10],
    'min_samples_split': [5, 6, 7],
}
rm_grid = GridSearchCV(estimator=rm_model, param_grid=param_grid, verbose=1)

In [None]:
# Run the grid search on the training split, then report the best
# cross-validated score and the parameter combination that produced it.
rm_grid.fit(x_train,y_train)
print(rm_grid.best_score_)
print(rm_grid.best_params_)

## Optimal RandomForestRegressor