In [None]:
import csv
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Question 5: Logistic Regression

## (a)

In [None]:
def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))

def hypothesis(X, beta):
  f = np.dot(X, beta.T)
  h = sigmoid(f)
  return h

def costFunction(X, beta, y):
  h_x = hypothesis(X, beta)
  J = np.multiply(y, np.log(h_x)) + np.multiply((1 - y), np.log(1 - h_x))
  J = -1 * J
  J = J.sum() / J.shape[0]

  return J

def gradient(X, beta, y):

  diff = y - hypothesis(X, beta)
  grad = np.dot(diff.T, X)
  return grad

def gradientDescent(X, beta, y, alpha=0.1, min_e=0.001, max_iters=100):

  cost = costFunction(X, beta, y)
  iterations = 0

  for i in range(max_iters):
    iterations = i + 1
    cost_old = cost
    grad = gradient(X, beta, y)
    beta = beta + alpha * grad
    cost = costFunction(X, beta, y)

    # print(cost_old - cost)
    if(cost_old - cost) < 0.0001:
      break

  return beta, iterations

def classify(X, beta):
  probabilities = hypothesis(X, beta)
  pred_y = np.where(probabilities >= 0.5, 1, 0)
  return pred_y


##(b)

Storing the dataset

In [None]:
X = np.array([
              [1, 0.346, 0.780],
              [1, 0.303, 0.439],
              [1, 0.358, 0.729],
              [1, 0.602, 0.863],
              [1, 0.790, 0.753],
              [1, 0.611, 0.965]])

y = np.array([0, 0, 0, 1, 1, 1])
y = y.reshape(y.shape[0], 1)

beta = np.array([-1, 1.5, 0.5])
beta = beta.reshape(1, beta.shape[0])

X_test = np.array([
              [1, 0.959, 0.382],
              [1, 0.750, 0.306],
              [1, 0.395, 0.760],
              [1, 0.823, 0.764],
              [1, 0.761, 0.874],
              [1, 0.844, 0.435]])
y_test = np.array([0, 0, 0, 1, 1, 1])
y_test = y_test.reshape(y_test.shape[0], 1)



### i. 

The logistic model uses probabilities to assign a class y to each sample. The cross-entropy function is as follows:

$$
H(P, Q) = - \sum_{samples} P(x) \times log_2(Q(x))
$$
Here, $P(x)$ and $Q(x)$ represent the actual label and the calculated label of one sample.

### ii.

In [None]:
# train
beta, iterations = gradientDescent(X, beta, y, alpha=0.01, min_e=0.001, max_iters=1)

print("Final weights:", beta)
print("Iterations:", iterations)

# classifying the test set
y_pred = classify(X_test, beta)

# correct classifications
print("Correctly predicted labels:", np.sum(y_test == y_pred))

results = (y_test == y_pred)
accuracy = np.sum(y_test == y_pred)/results.shape[0]
print("Accuracy: %.4f" % accuracy)

Final weights: [[-1.00189976  1.50321052  0.5011812 ]]
Iterations: 1
Correctly predicted labels: 4
Accuracy: 0.6667


### iii.

In [None]:
# train
beta, iterations = gradientDescent(X, beta, y, alpha=0.01, min_e=0.001)

print("Final weights:", beta)
print("Iterations:", iterations)


# classifying the test set
y_pred = classify(X_test, beta)

# correct classifications
print("Correctly predicted labels:", np.sum(y_test == y_pred))

# results = (y_test == y_pred)
# accuracy = np.sum(y_test == y_pred)/results.shape[0]
# print("Accuracy: %.4f" % accuracy)


from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score


accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average=None)
precision = precision_score(y_test, y_pred, average=None)
print("Accuracy %.4f" % accuracy)
print("Recall %.4f" % recall[1])
print("Precision %.4f" % precision[1])

Final weights: [[-1.21075903  1.80665776  0.59948195]]
Iterations: 100
Correctly predicted labels: 4
Accuracy 0.6667
Recall 1.0000
Precision 0.6000


# Question 6: Kaggle - Taxi Fare Prediction

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn import ensemble
from sklearn.neighbors import KNeighborsRegressor


import pandas as pd

data_train_path = "/content/drive/MyDrive/Acad_Stuff/Course Work/Semester 5/FoML/Ass4/FOML_Assgn4_dataset/train.csv"
data_test_path = "/content/drive/MyDrive/Acad_Stuff/Course Work/Semester 5/FoML/Ass4/FOML_Assgn4_dataset/test.csv"

# data_train_path = "train.csv"
# data_test_path = "test.csv"

In [3]:
# read the train and test sets
train_data = pd.read_csv(data_train_path, nrows=1000000)
test_data = pd.read_csv(data_test_path)

In [4]:
# convert date_time and key to a numerical format
train_data['pickup_datetime'] =  pd.to_datetime(train_data['pickup_datetime'])
test_data['pickup_datetime'] =  pd.to_datetime(test_data['pickup_datetime'])

key = test_data.loc[:, 'key']

train_data.drop('key', axis='columns', inplace=True)
test_data.drop('key', axis='columns', inplace=True)

In [5]:
print("Train data:", train_data.shape)
print(train_data.isnull().sum())

print("Test data:", train_data.shape)
print(test_data.isnull().sum())

# removing rows with any NaN values from the train set
train_data = train_data.drop(train_data[train_data.isnull().any(1)].index, axis = 0)

# discarding rows where the fare is negative
train_data = train_data.drop(train_data[train_data['fare_amount'] < 0].index, axis = 0)

# checking number of passengers vs fares
print(train_data['passenger_count'].value_counts())
print(test_data['passenger_count'].value_counts())
# discarding rows where passenger_count == 208 and 0
train_data = train_data.drop(train_data[train_data['passenger_count'] == 0].index, axis = 0)
train_data = train_data.drop(train_data[train_data['passenger_count'] == 208].index, axis = 0)

# on manually observing the data, it appears as though taxis charge extra during the night
# thus using the pickup_datetime column to create a extra_charge columns
# charge extra for cabs between 5 pm to 5 am
train_data['extra_charge'] = 0
test_data['extra_charge'] = 0


train_data.loc[train_data['pickup_datetime'].dt.time >  pd.to_datetime('17:00:00').time(), 'extra_charge'] = 1
train_data.loc[train_data['pickup_datetime'].dt.time <= pd.to_datetime('8:00:00').time(), 'extra_charge'] = 1
train_data.drop('pickup_datetime', axis='columns', inplace=True)

test_data.loc[test_data['pickup_datetime'].dt.time >  pd.to_datetime('17:00:00').time(), 'extra_charge'] = 1
test_data.loc[test_data['pickup_datetime'].dt.time <= pd.to_datetime('8:00:00').time(), 'extra_charge'] = 1
test_data.drop('pickup_datetime', axis='columns', inplace=True)


print("Train data:", train_data.shape)
print(train_data.isnull().sum())

print("Test data:", train_data.shape)
print(test_data.isnull().sum())

train_data_np = train_data.to_numpy()
test_data_np = test_data.to_numpy()

Train data: (1000000, 7)
fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude    10
dropoff_latitude     10
passenger_count       0
dtype: int64
Test data: (1000000, 7)
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64
1      691316
2      148102
5       70593
3       43762
4       21428
6       21195
0        3555
208         1
Name: passenger_count, dtype: int64
1    6914
2    1474
5     696
3     447
4     206
6     177
Name: passenger_count, dtype: int64
Train data: (996396, 7)
fare_amount          0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
extra_charge         0
dtype: int64
Test data: (996396, 7)
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
extra_charge         0
dtype: int64


In [6]:
X = train_data_np[:, 1:]
y = train_data_np[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

### Gradient Boosting

In [48]:
# train vs val
params = {
    "n_estimators": 100,
    "max_depth": 8,
    "min_samples_split": 5,
    "learning_rate": 0.08,
    "loss": "squared_error",
}

reg_GB = ensemble.GradientBoostingRegressor(**params)
reg_GB.fit(X_train, y_train)
rmse_gb = mean_squared_error(y_test, reg_GB.predict(X_test), squared=False)
print("RSME:", rmse_gb)


RSME: 4.612397306472591


In [17]:
# entire train set
params = {
    "n_estimators": 100,
    "max_depth": 8,
    "min_samples_split": 5,
    "learning_rate": 0.1,
    "loss": "squared_error",
}

reg_GB = ensemble.GradientBoostingRegressor(**params)
reg_GB.fit(X, y)
final_y_gb = reg_GB.predict(test_data_np)

final_y_gb = np.c_[key, final_y_gb]
final_y_gb = np.r_[[['key', 'fare_amount']], final_y_gb]


fname = 'final_pred_GB_reg.csv'
with open(fname, "w") as f:
    writer = csv.writer(f)
    writer.writerows(final_y_gb)

### Random Forest

In [11]:
# train vs val
reg_RF = ensemble.RandomForestRegressor(n_estimators=100, max_depth=10, random_state=0, max_features="sqrt", max_samples=600000)
reg_RF.fit(X_train, y_train)

rmse_RF = mean_squared_error(y_test, reg_RF.predict(X_test), squared=False)
print("RSME:", rmse_RF)

RSME: 5.371848256554083


In [14]:
# entire train set
reg_RF = ensemble.RandomForestRegressor(n_estimators=100, random_state=0, max_features="sqrt", max_samples=600000)
reg_RF.fit(X, y)

reg_RF.fit(X, y)
final_y_rf = reg_RF.predict(test_data_np)

final_y_rf = np.c_[key, final_y_rf]
final_y_rf = np.r_[[['key', 'fare_amount']], final_y_rf]

fname = 'final_pred_RF_reg.csv'
with open(fname, "w") as f:
    writer = csv.writer(f)
    writer.writerows(final_y_rf)

## Report

### Best Scores:
* Gradient Boosting: 3.74619
  - n_estimators: 100
  - max_depth: 8
  - min_samples_split: 5
  - learning_rate: 0.1
  - loss: "squared_error
* Random Forest: 3.83735
  - n_estimators: 100
  - max_features: "sqrt"
  - max_samples: 600000
  - random_state: 0

### Preprocessing

The following steps were performed during preprocessing:
* converted the pickup_datetime column to a date time format
  - used that to compute the column "extra_charges"
  - if a taxi is booked between 5 pm to 8 am (i.e., after/before the 9 - 5 workday), the passengers get charged extra
* dropped the key and pickup_datetime columns
* removed rows with negative fare amounts
* removed rows which had 0 passangers, and one row with 208 passangers which was clearly an outlier

### Other models
In addition to Gradient Boosting and Random Forests, I also tried out KNN (5-NN and 10-NN). The RSME was a little higher than those obtained from the andom Forest Regressor.

Gradient boosting and Random Forests worked better than the KNN model because KNN was very prone to overfitting due to the number of neighbours I chose: KNN gave very low RSME scores on the train set however that score increased when it came to the actual test set.