In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import pickle
# import warnings
# warnings.filterwarnings("ignore")

# Reading Data

In [2]:
df_chunk = pd.read_csv('train.csv', chunksize = 500000, low_memory = False)
chunks = []

In [3]:
invalid = [0]*500000
coord_list = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'fare_amount']
def mark_invalid(chunk):
    for c in coord_list:
        for i in chunk.index:
            if(c == "pickup_longitude" or c == "dropoff_longitude"):
                if(chunk[c][i].astype(float) > -73.699215 or chunk[c][i].astype(float) < -74.257159):
#                     chunk = chunk.replace(chunk[c][i],np.nan)
#                     chunk = chunk.drop([i])
                    invalid[i%500000] = 1
            elif (c == "pickup_latitude" or c == "dropoff_latitude"):
                if(chunk[c][i].astype(float) > 40.915568 or chunk[c][i].astype(float) < 40.495992):
#                     chunk = chunk.replace(chunk[c][i],np.nan)
#                     chunk = chunk.drop([i])
                    invalid[i%500000] = 1
            elif(c == "fare_amount"):
                if(chunk[c][i] >= 200 or chunk[c][i] <= 0):
#                     chunk = chunk.drop([i])
                    invalid[i%500000] = 1
    return chunk

invalid_test = [0]*1000000
coord_list_test = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
def mark_invalid_test(chunk):
    for c in coord_list_test:
        for i in chunk.index:
            if(c == "pickup_longitude" or c == "dropoff_longitude"):
                if(chunk[c][i].astype(float) > -73.699215 or chunk[c][i].astype(float) < -74.257159):
#                     chunk = chunk.replace(chunk[c][i],np.nan)
#                     chunk = chunk.drop([i])
                    invalid_test[i%1000000] = 1
            elif (c == "pickup_latitude" or c == "dropoff_latitude"):
                if(chunk[c][i].astype(float) > 40.915568 or chunk[c][i].astype(float) < 40.495992):
#                     chunk = chunk.replace(chunk[c][i],np.nan)
#                     chunk = chunk.drop([i])
                    invalid_test[i%1000000] = 1
    return chunk
        

In [4]:
def mark_outlier(chunk, data_1):
    outliers_indices=[]
    threshold = 3
    mean_1 = np.mean(data_1)
    std_1 = np.std(data_1)
    
#     length = len(data_1)
    for i in chunk.index:
        z_score= (data_1[i] - mean_1)/std_1 
        if np.abs(z_score) > threshold:
            outliers_indices.append(i)
    for i in outliers_indices:
#         chunk = chunk.drop([i])
        chunk['invalid'][i] = 1
    return chunk


# print(len(detect_outlier(chunks[1]['fare_amount'])))

In [5]:
def split_datetime(chunk):
    hours = []
    mins = []
    secs = []
    years = []
    months = []
    days = []
    length = len(chunk['pickup_longitude'])
    
    for i in chunk.index:
        years.append(int(chunk['pickup_datetime'][i][0:4]))
        months.append(int(chunk['pickup_datetime'][i][5:7]) - 1) # 1 is subtracted to aid in days from jan 1st calculations
        days.append(int(chunk['pickup_datetime'][i][8:10]))
        hours.append(int(chunk['pickup_datetime'][i][11:13]))
        mins.append(int(chunk['pickup_datetime'][i][14:16]))
        secs.append(int(chunk['pickup_datetime'][i][17:19]))

    chunk['years'] = years
    chunk['months'] = months
    chunk['days'] = days
    chunk['hours'] = hours
    chunk['mins'] = mins
    chunk['secs'] = secs
    
    return chunk

In [6]:
def modify_datetime(chunk):
    chunk['secs_past_midnight'] = (chunk['hours']*3600) + (chunk['mins']*60) + (chunk['secs'])
    chunk['sin_spm'] = np.sin(2*np.pi*(chunk['secs_past_midnight']/86400))
    chunk['cos_spm'] = np.cos(2*np.pi*(chunk['secs_past_midnight']/86400))
    chunk['days_past_jan1'] = (chunk['months']*30) + (chunk['days'])
    chunk['sin_dpj'] = np.sin(2*np.pi*(chunk['days_past_jan1']/365))
    chunk['cos_dpj'] = np.cos(2*np.pi*(chunk['days_past_jan1']/365))
    
    return chunk

In [7]:
def split_data(chunk):
    y = chunk['fare_amount']
    X = pd.DataFrame(chunk)
    X = X.drop(['fare_amount','key','pickup_datetime', 'years', 'months', 'days', 'hours', 'mins', 'secs', 'secs_past_midnight', 'days_past_jan1'], axis = 1)
#     X = StandardScaler().fit_transform(X)
    return (X, y)

In [8]:
def fit_model_rmse(X, y, linreg = LinearRegression()):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
#     linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    return (linreg, rmse)

# X1_train, X1_test, y1_train, y1_test = train_test_split(features, target, random_state=1)

In [9]:
def save_model(model, filename): 
#     filename = 'model.sav'
    pickle.dump(model, open(filename, 'wb'))

In [10]:
def load_model(filename): 
    model = pickle.load(open(filename, 'rb'))
    return model

In [11]:
def req_columns_test(chunk):
    temp = pd.DataFrame(chunk)
    temp = temp.drop(['key','pickup_datetime', 'years', 'months', 'days', 'hours', 'mins', 'secs', 'secs_past_midnight', 'days_past_jan1'], axis = 1)
    return temp

# Reading chunks and applying functions

In [12]:
count = 0
for chunk in df_chunk:
    chunk = pd.DataFrame(chunk)
    if (count == 0):
        model = LinearRegression()
    else:
        model = load_model("model.sav")
        
    chunk = mark_invalid(chunk)
    chunk['invalid'] = invalid
    chunk.dropna(inplace = True)
    chunk = split_datetime(chunk)
    chunk = modify_datetime(chunk)
    chunk = mark_outlier(chunk, chunk['fare_amount'])
    chunk['manhattan_dist'] = abs(chunk['pickup_latitude']-chunk['dropoff_latitude']) + abs(chunk['pickup_longitude']-chunk['dropoff_longitude'])
    (X1, y1) = split_data(chunk)
    X1 = pd.DataFrame(X1)
    model, rmse = fit_model_rmse(X1, y1, model)
    save_model(model, "model.sav")
    print(rmse)
    
    if (count == 0):
        chunks.append(pd.DataFrame(chunk))
    count += 1
    if(count == 2):
        break

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


7.338667972014402


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


8.689230615870558


In [13]:
count = 0
test_chunk = pd.read_csv('test.csv', chunksize = 1000000, low_memory = False)

In [14]:

# print(pd.DataFrame(test_chunk).shape)
test_chunks = []
final_dfs = []
for chunk in test_chunk:
    chunk = pd.DataFrame(chunk)
    if(count == 0):
        model = load_model("model.sav")
    chunk = mark_invalid_test(chunk)
    chunk['invalid'] = invalid_test
    chunk = split_datetime(chunk)
    chunk = modify_datetime(chunk)
    chunk['manhattan_dist'] = abs(chunk['pickup_latitude']-chunk['dropoff_latitude']) + abs(chunk['pickup_longitude']-chunk['dropoff_longitude'])
    if(count == 0):
        test_chunks.append(pd.DataFrame(chunk))
    print(chunk.shape)
    req_df = req_columns_test(chunk)
    print(req_df.shape)
    y_pred = model.predict(req_df)
    df = pd.concat([chunk['key'], y_pred], axis = 1)
    final_dfs.append(df)
    
    count += 1
    break
print(count)

(1000000, 21)
(1000000, 11)


TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

In [None]:
test_chunks[0].head()

In [None]:
# test_chunks[0].describe()

In [None]:
# test_chunks[0].isnull().sum()

# Visualization plots 

In [None]:
fig, ax = plt.subplots()
print(chunks[0]["pickup_longitude"].plot.hist(ax = ax, title="pickup longitude",bottom=1, bins=25))
ax.set_yscale('log')

In [None]:
fig, ax = plt.subplots()
print(chunks[0]["pickup_latitude"].plot.hist(ax = ax, title="pickup latitude",bottom=1, bins=25))
ax.set_yscale('log')

In [None]:
fig, ax = plt.subplots()
print(chunks[0]["dropoff_longitude"].plot.hist(ax = ax, title="dropoff longitude",bottom=1, bins=25))
ax.set_yscale('log')

In [None]:
fig, ax = plt.subplots()
print(chunks[0]["dropoff_latitude"].plot.hist(ax = ax, title="dropoff latitude",bottom=1, bins=25))
ax.set_yscale('log')

In [None]:
fig, ax = plt.subplots()
print(chunks[0]["passenger_count"].plot.hist(ax = ax, title="passenger count",bottom=1, bins=25))
ax.set_yscale('log')

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, sharex = True, sharey = True)
ax1.scatter(chunks[0]["pickup_latitude"],chunks[0]["fare_amount"])
# ax1.xlabel("pickup_latitude")
# ax1.ylabel("fare_amount")
ax2.scatter(chunks[0]["pickup_longitude"],chunks[0]["fare_amount"])
# ax2.xlabel("pickup_longitude")
# ax2.ylabel("fare_amount")
plt.show()

In [None]:
plt.scatter(chunks[0]["dropoff_latitude"],chunks[0]["fare_amount"])
plt.xlabel("dropoff_latitude")
plt.ylabel("fare_amount")
plt.show()

In [None]:
plt.scatter(chunks[0]["dropoff_longitude"],chunks[0]["fare_amount"])
plt.xlabel("dropoff_longitude")
plt.ylabel("fare_amount")
plt.show()

In [None]:
plt.scatter(chunks[0]["passenger_count"],chunks[0]["fare_amount"])
plt.xlabel("passenger_count")
plt.ylabel("fare_amount")
plt.show()

# Detecting Outliers

In [None]:
plt.boxplot(chunks[0].fare_amount)

In [None]:
chunks[0].to_csv('preprocessed.csv')
chunk1 = pd.read_csv('preprocessed.csv')
chunks.append(chunk1)
chunks[1].head()
chunks[0].describe()

In [None]:
# indices = detect_outlier(chunks[1]['fare_amount'])
# for i in indices:
#     chunks[1] = chunks[1].drop([i])
#     print(i)

In [None]:
chunks[1].to_csv('preprocessed1.csv')
chunks.append(pd.read_csv('preprocessed1.csv'))

# Restricting coordinates to NYC

All coordinates outside NYC are directly dropped

# Removing all rows with null values

In [None]:
# type(chunks[0])
# print(chunks[0].isnull().sum())
# chunks[0].dropna(inplace = True)
# print(chunks[0].isnull().sum())
# chunks[0]['pickup_datetime'][0]
# chunks[0].describe()
# type(chunks[0]['dropoff_longitude'][161652])

# Splitting pickup date time 

# Generating secs after midnight and days past jan 1st

In [None]:
# chunks[0] = chunks[0].drop(['Unnamed: 0', 'Unnamed: 0.1'], axis = 1)
chunks[1].head()

In [None]:
# chunks[0]['manhattan_dist'] = abs(chunks[0]['pickup_latitude']-chunks[0]['dropoff_latitude']) + abs(chunks[0]['pickup_longitude']-chunks[0]['dropoff_longitude'])

In [None]:
chunks[0].head()

In [None]:
# features = chunks[0].drop(['fare_amount','key','pickup_datetime', 'years', 'months', 'days', 'hours', 'mins', 'secs', 'secs_past_midnight', 'days_past_jan1'], axis = 1)
# target = chunks[0]['fare_amount']
# features = StandardScaler().fit_transform(features)

# pca = PCA(n_components=6)
# principalComponents = pca.fit_transform(features)
# principalDf = pd.DataFrame(data = principalComponents, columns = ['PrincipalC1', 'PrincipalC2', 'PrincipalC3','PrincipalC4', 'PrincipalC5', 'PrincipalC6'])
# finalDf = pd.concat([principalDf, target], axis=1)
# finalDf.head()
# chunks[0].head()

# Generating X and y

# Test train split & Linear Regression & RMSE

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# print(X1_train.shape)
# print(y1_train.shape)
# print(X1_test.shape)
# print(y1_test.shape)

# Applying Linear Regression

In [None]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [None]:
# linreg1 = LinearRegression()
# linreg1.fit(X1_train, y1_train)

## Predicting outputs

In [None]:
y_pred = linreg.predict(X_test)
# y1_pred = linreg1.predict(X1_test)

## Generating RMSE

In [None]:
print("Without PCA: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# print("With PCA: ", np.sqrt(metrics.mean_squared_error(y1_test, y1_pred)))