In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so apply whichever whitegrid style this install has.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# import lightgbm as lgb
from sklearn.metrics import r2_score

In [2]:
# Load the training CSV through Spark (header row, inferred column types) and
# cap the result at 100,000 rows so it can be pulled into pandas later.
spark_train = (
    spark.read
    .format("csv")
    .options(inferSchema="true", header="true")
    .load("dbfs:/mnt/data/train.csv")
    .limit(100000)
)

In [3]:
# Collect the (row-limited) Spark DataFrame into a pandas DataFrame.
train = spark_train.toPandas()

In [4]:
# Keep only rows with a non-negative fare, reporting the row count before and after.
print('Old size: {}'.format(len(train)))
non_negative_fare = train['fare_amount'] >= 0
train = train.loc[non_negative_fare]
print('New size: {}'.format(len(train)))

# Show the per-column null counts, then drop every row that contains any null.
null_counts = train.isnull().sum()
print(null_counts)
print('Old size: {}'.format(len(train)))
train = train.dropna(axis=0, how='any')
print('New size: {}'.format(len(train)))

In [5]:
# Convert the `key` and `pickup_datetime` columns to pandas datetimes,
# then print the frame summary to confirm the resulting dtypes.
train = train.assign(
    key=pd.to_datetime(train['key']),
    pickup_datetime=pd.to_datetime(train['pickup_datetime']),
)
train.info()

In [6]:
# Derive calendar features (year, month, day, hour) from the pickup timestamp.
pickup_parts = train["pickup_datetime"].dt
for part_name in ("year", "month", "day", "hour"):
    train[part_name] = getattr(pickup_parts, part_name)

In [7]:
# Display the column index to confirm the date-part columns were added.
train.columns

In [8]:
# Coordinate columns used for both the cleaning step and the distance feature.
coord = ['pickup_longitude','pickup_latitude', 
         'dropoff_longitude', 'dropoff_latitude']

# A coordinate of exactly 0 is treated as missing: keep only rows where all four
# coordinates are present and non-zero. Building one mask and taking an explicit
# copy avoids assigning into a filtered slice (SettingWithCopyWarning) both here
# and when the distance column is added below.
coord_values = train[coord]
has_all_coords = coord_values.notnull().all(axis=1) & coord_values.ne(0).all(axis=1)
train = train.loc[has_all_coords].copy()

#radius of earth in kilometers
R = 6373.0

# The haversine formula works in radians.
pickup_lat  = np.radians(train["pickup_latitude"])
pickup_lon  = np.radians(train["pickup_longitude"])
dropoff_lat = np.radians(train["dropoff_latitude"])
dropoff_lon = np.radians(train["dropoff_longitude"])

dist_lon = dropoff_lon - pickup_lon
dist_lat = dropoff_lat - pickup_lat

# Haversine formula: great-circle distance between pickup and dropoff.
a = (np.sin(dist_lat/2))**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * (np.sin(dist_lon/2))**2
# `a` lies in [0, 1] mathematically; clamp it so floating-point rounding can
# never push 1 - a below zero and turn the square root into NaN.
a = a.clip(lower=0.0, upper=1.0)
c = 2 * np.arctan2( np.sqrt(a), np.sqrt(1-a) )
d = R * c #(where R is the radius of the Earth)

train["trip_distance_km"] = d

# Preview the coordinates next to the computed distance.
train[coord + ["trip_distance_km"]].head(7)

In [9]:
# Keep trips shorter than 200 km; rows with a missing fare are kept regardless
# of their distance.
is_short_trip = train["trip_distance_km"] < 200
fare_is_missing = train["fare_amount"].isnull()
train = train[is_short_trip | fare_is_missing].copy()

In [10]:
# Modelling frame: only the rows that have a fare value.
has_fare = train["fare_amount"].notnull()
m_train = train[has_fare].copy()
m_train.shape

In [11]:
# Remove the identifier and raw timestamp columns before modelling.
m_train = m_train.drop(columns=['key', 'pickup_datetime'])

### Train Test Split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Separate the target from the features, then hold out 30% of rows for testing
# with a fixed random_state.
y_train_ = m_train['fare_amount']
x_train_ = m_train.drop(columns="fare_amount")
X_train, X_test, y_train, y_test = train_test_split(
    x_train_,
    y_train_,
    test_size=0.30,
    random_state=101,
)

### Random Forest model

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
# Seed passed to the models and the Spark split below.
SEED = 100

- model init (we can change model hyper parameters here):
- y = a1*x1 + a2*x2 + a0
- fit (we use X_train and y_train and expect the best model to be created):
- y = 5*x1 - 7*x2 + 574
- predictions (we use x_test to get predictions):
- yModel = predictions = 5*x1_test - 7*x2_test + 574
- predictions == y_test? (we check how closely the model's predictions match the actual y values)
- calculate quality metrics

In [19]:
# Random forest with 600 trees. n_jobs=-1 builds the trees on all available
# cores; with a fixed random_state the fitted forest is the same as a
# single-threaded fit, only faster.
rfr_model = RandomForestRegressor(n_estimators=600, random_state=SEED, n_jobs=-1)

In [20]:
# Fit the random forest on the training split.
rfr_model.fit(X=X_train, y=y_train)

In [21]:
# Predict fares for the held-out split with the fitted random forest.
predictions_rfr = rfr_model.predict(X=X_test)

In [22]:
from sklearn.metrics import r2_score

In [23]:
# R^2 of the random-forest predictions against the held-out fares.
r2_score(y_true=y_test, y_pred=predictions_rfr)

In [24]:
import pickle

# Persist the fitted estimator itself. The previous version pickled
# str(rfr_model), which stores only the text repr and cannot be loaded back
# into a usable model. The file name is unchanged, but its content is a binary
# pickle, not text.
with open("rfr_model.txt", "wb") as f:
    pickle.dump(rfr_model, f)

### BOOSTING USING LGB

In [26]:
# LightGBM training parameters. Trailing comments keep the alternative values
# noted by the original author.
params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': -1,
        'verbose': 0,
        'num_leaves': 50, # 31
        'learning_rate': 0.1, # 0.05
        'max_depth': -1,
        'subsample': 0.8,
        'subsample_freq': 1,
        'colsample_bytree': 0.6,
        # Was misspelled 'reg_aplha', an unknown key, so the L1 penalty was
        # never actually applied.
        'reg_alpha': 1,
        'reg_lambda': 0.001,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 80, # 10
        'scale_pos_weight': 1,
        'seed': SEED
    }

In [27]:
# Zero-filled vector with one slot per test row; show its shape.
n_test_rows = X_test.shape[0]
pred_test_y = np.zeros(n_test_rows)
pred_test_y.shape


In [28]:
import lightgbm as lgb

# Wrap the training split in a LightGBM Dataset. The `silent=` argument was
# deprecated and then removed from Dataset, so log suppression is requested
# through `params` instead, which works on old and new releases alike.
train_set = lgb.Dataset(X_train, label=y_train, params={'verbose': -1})
train_set

In [29]:
# Train the LightGBM booster for 300 boosting rounds and print the result.
model = lgb.train(
    params=params,
    train_set=train_set,
    num_boost_round=300,
)
print(model)

In [30]:
# Predict on the held-out split, passing the booster's recorded best_iteration.
lgb_iteration = model.best_iteration
predictions_lgb = model.predict(X_test, num_iteration=lgb_iteration)

In [31]:
# R^2 of the LightGBM predictions against the held-out fares.
r2_score(y_true=y_test, y_pred=predictions_lgb)

### BOOSTING USING XGBOOST

In [33]:
import xgboost as xgb 

In [34]:
# Wrap the splits in XGBoost DMatrix objects; only the training matrix has labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

In [35]:
# Display the training DMatrix object.
dtrain

In [36]:
# Parameters for XGBoost.
# NOTE: this rebinds `params`, which the LightGBM cell earlier in the notebook
# also uses; re-run that cell before retraining LightGBM.
#
# Changes from the previous dict:
# - 'eta' is an alias of 'learning_rate'; the dict set both (1 and 0.05), which
#   is ambiguous. Only 'learning_rate': 0.05 is kept.
#   NOTE(review): confirm 0.05 is the intended step size.
# - 'silent' is deprecated in favour of 'verbosity' (0 = silent).
# - 'reg:linear' is the deprecated name of 'reg:squarederror'.
params = {'max_depth':7,
          'learning_rate':0.05,
          'verbosity':0,
          'objective':'reg:squarederror',
          'eval_metric':'rmse',
          'seed': SEED
         }
# Number of boosting rounds.
num_rounds = 50

In [37]:
# Train the XGBoost booster for `num_rounds` rounds on the training matrix.
xb = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_rounds,
)

In [38]:
# Predict on the test matrix and print the raw prediction array.
y_pred_xgb = xb.predict(data=dtest)
print(y_pred_xgb)

In [39]:
# R^2 of the XGBoost predictions against the held-out fares.
r2_score(y_true=y_test, y_pred=y_pred_xgb)

In [40]:
# Rebuild a single pandas frame (features plus a `target` column) from the
# training split, hand it to Spark, and split it 85/15 with a fixed seed.
mmls = X_train.assign(target=y_train)
mmls_train = spark.createDataFrame(mmls)
split_weights = [0.85, 0.15]
train_s, test_s = mmls_train.randomSplit(split_weights, seed=SEED)

In [41]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Columns packed into the single `features` vector column by the assembler.
feature_cols = [
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "passenger_count",
    "year",
    "month",
    "day",
    "hour",
    "trip_distance_km",
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [42]:
from mmlspark import LightGBMRegressor
from pyspark.ml import Pipeline
# Configure the Spark LightGBM regressor and point it at the `target` label column.
lgbm = LightGBMRegressor(
    objective='regression',
    alpha=0.8,
    learningRate=0.1,
    numIterations=120,
    numLeaves=31,
)
lgbm = lgbm.setLabelCol("target")

# Chain feature assembly and the regressor, then fit on the Spark training split.
pipeline_stages = [assembler, lgbm]
pipeline = Pipeline(stages=pipeline_stages)
lgbm_model = pipeline.fit(train_s)

In [43]:
# Apply the fitted pipeline to the Spark test split.
lgbm_model_pred = lgbm_model.transform(test_s)

In [44]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import RegressionMetrics
# NOTE(review): repeats the transform from the previous cell with the same
# inputs; the two imports above it are not used in the visible notebook.
lgbm_model_pred = lgbm_model.transform(test_s)

In [45]:
%r
# Fit a Gaussian-family GLM of `price` on all other columns of `trainingData`.
# NOTE(review): neither `trainingData` nor a `price` column is defined in the
# visible part of this notebook (the Python cells use `fare_amount` / `target`),
# so this cell presumably fails on a fresh run; confirm it belongs here.
lrModel <- glm(price ~ ., data = trainingData, family = "gaussian")

# Print a summary of the trained model
summary(lrModel)

In [46]:
# Collect predictions and labels in a single pass so each prediction stays on
# the same row as its label. The previous version ran two separate Spark
# collects and joined them by position, which relies on both jobs returning
# rows in the same order.
metrics_df = (
    lgbm_model_pred
    .select('prediction', 'target')
    .toPandas()
    .rename(columns={'target': 'test'})
)

In [47]:
# R^2 of the Spark LightGBM pipeline's predictions on its own test split.
r2_score(y_true=metrics_df['test'], y_pred=metrics_df['prediction'])

In [48]:
# Save the fitted pipeline model, overwriting anything already at the target path.
model_writer = lgbm_model.write().overwrite()
model_writer.save("/modelMain")