<a href="https://colab.research.google.com/github/Yadukrishnan1/Kaggle-comps/blob/main/room_availability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Python 3 environment 
import numpy as np              # Linear algebra
import pandas as pd             # Data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns           # Data visualization
import matplotlib.pyplot as plt # Data visualization
import os                       # Operating System library
from os import path
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline


In [None]:
# ML Libraries
import sys
!{sys.executable} -m pip install xgboost sklearn

from sklearn.preprocessing import StandardScaler
import sklearn
from sklearn.metrics import mean_squared_error, f1_score, roc_curve, auc, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split, KFold, RandomizedSearchCV
from xgboost.sklearn import XGBRegressor, XGBClassifier
import xgboost as xgb


# Reading the data and exploring to find features and classes

In [None]:
# Load the raw data set and record which columns are features and which is the label.
# NOTE(review): the CSV path is an empty placeholder — set it to the data file before running.
df=pd.read_csv('')
# FEATURES holds the first 30 column labels of the frame as loaded (fewer if the file has fewer columns).
# It is computed once here, so columns created by later cells are not part of it.
# NOTE(review): the slice may include TARGET itself — confirm against the dataset.
FEATURES=list(df.columns[0:30]) # Change the columns according to dataset
TARGET='yearly_availability'
df.head()

# Function for a quick EDA (Exploratory Data Analysis)

In [None]:
def EDA(df):
  """
  Print a quick exploratory summary of a DataFrame.

  Four sections are written to standard output, in this order:
  the `info()` report, the `describe()` statistics, the column labels
  and the per-column count of missing values.

  Parameters
  ----------
  df : pandas.DataFrame
      The frame to summarise. It is not modified.

  Returns
  -------
  None
  """
  # DataFrame.info() writes its report itself and returns None, so it must be
  # called on its own line: passing it to print() shows the report *before*
  # the header and then prints the literal text "None".
  print('The column name, Dtype, and Null-count\n\n')
  df.info()
  print('The descriptory statistics of the features and label\n\n', df.describe())
  print('The features and the label of the data\n\n', df.columns)
  print('The number of missing values in the data\n\n', df.isnull().sum())


# Checking the class imbalance

In [None]:
# Class imbalance needs to be taken care of before applying any model

# Share of each class among all rows, in percent
print('Hotels-0 fraction in the data :',len(df[df['yearly_availability']==0])/len(df)*100)
print('Hotels-1 fraction in the data :',len(df[df['yearly_availability']==1])/len(df)*100)

# Class imbalance visualized

# The theme has to be set before the figure is created, otherwise it does not apply to these axes.
sns.set_theme(style="darkgrid")

fig, axes = plt.subplots(1, 1, figsize=(7,5), dpi=100)
fig.suptitle("Class imbalance", y=1.1, fontsize=18)

# Count the rows of each class in the same label column that the percentages above use.
g=sns.countplot(data=df, x='yearly_availability', ax=axes)
axes.set_yscale("log")  # log scale keeps the smaller class visible
axes.set_xlabel("Class Label", fontsize=16)
# countplot draws absolute counts, not fractions
axes.set_ylabel("Number of labelled samples (log scale)", fontsize=16)
# Label meaning follows the sub-sampling cell, which treats 1 as "available" and 0 as "not available".
axes.set_title('Class Distributions \n (0: Not available | 1: Available)', fontsize=14)

fig.subplots_adjust(left=0., bottom=0., right=1., top=1.0)
# plt.savefig('class_imbalance_expedia.png', dpi=300, bbox_inches = 'tight')
plt.show()

# Encoding the categorical variables

In [None]:
# Work on a copy of the object-dtype (text) columns only
cat_df = df.select_dtypes(include=['object']).copy()

# One encoder, fitted on all categorical columns at once; every category becomes an ordinal code
enc_make = OrdinalEncoder()
cat_df_transformed = enc_make.fit_transform(cat_df)

# Column k of the encoded array corresponds to the k-th categorical column:
# store it in cat_df and write it back into df under the same label.
for position, column in enumerate(cat_df.columns):
  cat_df[column] = cat_df_transformed[:, position]
  df[column] = cat_df[column]

df.head()

# Scaling of the continuous variables

In [None]:
# Scaling of features: the 'Amount' and 'Time' columns are scaled below with RobustScaler.
# NOTE(review): an earlier version of this comment mentioned PCA; no PCA is performed in this cell.
# NOTE(review): 'Amount', 'Time' and 'Class' differ from the label name used by the other cells
# ('yearly_availability') — confirm these columns exist in the data set before running.

from sklearn.preprocessing import StandardScaler, RobustScaler

std_scaler = StandardScaler()  # created here but not used in this cell
rob_scaler = RobustScaler()

# We will use Robust scaler because it's ideal if there are outliers

# 'Amount' is overwritten in place with its scaled values.
df['Amount'] = rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
# The same scaler object is fitted again on 'Time', replacing what it learned from 'Amount'.
# The result goes into a new 'scaled_time' column; the raw 'Time' column is kept.
df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))

# Reorder the columns so that 'Class' comes last.
df=df[[c for c in df if c not in ['Class']] + ['Class']]

# Outlier detection 

In [None]:
# Outlier detection for features using the mean and the standard deviation, assuming a Gaussian distribution
# The features to be used: 

def outlier_detection(df, feature, n_std=1):
  """
  Blank out the values of one column that lie far from the column mean.

  Values outside the open interval (mean - n_std*std, mean + n_std*std) are
  replaced by NaN. The column is updated in place inside `df`; no rows are dropped.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the column. Modified in place.
  feature : hashable
      Label of the column to clean.
  n_std : float, default 1
      Half-width of the accepted band, in standard deviations.
      The default reproduces the original one-sigma band.

  Returns
  -------
  pandas.Series
      The updated column, i.e. `df[feature]` after masking.
  """
  column = df[feature]
  # Compute the statistics once instead of once per comparison.
  center = column.mean()
  spread = n_std * column.std()
  inside = (column > center - spread) & (column < center + spread)
  # where() masks element by element and keeps the index untouched, so it also
  # works when the index has duplicate labels (assigning a filtered subset back
  # relies on index alignment and fails in that case).
  df[feature] = column.where(inside)
  return df[feature]

# NOTE(review): the feature name is an empty-string placeholder — replace '' with the label of the column to clean before running.
outlier_detection(df, '')


# Feature creation: distance from each hotel to a particular destination, in this case the New York City center.

In [None]:
# Feature creation: Distance between NYC and the hotels using the Haversine Formula

from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees).

    Every argument may be a scalar or an array-like such as a pandas Series
    or a numpy array; the usual numpy broadcasting rules apply, so a whole
    column of coordinates can be compared with one fixed point in a single call.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point(s), in decimal degrees.

    Returns
    -------
    float or array-like
        Distance in kilometers, with the broadcast shape of the inputs.
    """
    # convert decimal degrees to radians (numpy functions accept scalars and arrays alike,
    # whereas the math module only accepts scalars)
    lon1, lat1, lon2, lat2 = (np.radians(value) for value in (lon1, lat1, lon2, lat2))

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push sqrt(a) slightly above 1 for near-antipodal points; clamp it
    # so that arcsin stays defined.
    c = 2 * np.arcsin(np.minimum(1.0, np.sqrt(a)))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

In [None]:
# Reference point: New York City, in decimal degrees
NYC_lat = 40.730610
NYC_long = -73.935242

# NOTE(review): the column selectors were left empty (`df[]`), which is a syntax error.
# The two labels below are assumptions — TODO confirm them against the data set's coordinate columns.
LONGITUDE_COL = 'longitude'
LATITUDE_COL = 'latitude'

# haversine() expects (lon1, lat1, lon2, lat2). It is called once per row so the cell
# works regardless of whether the function accepts array inputs.
df['distance_city'] = [
    haversine(lon, lat, NYC_long, NYC_lat)
    for lon, lat in zip(df[LONGITUDE_COL], df[LATITUDE_COL])
]

# **Stratified K-fold cross-validation of the data due to imbalance**

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

# F1 score of every fold is collected here
f1score=[]
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(df[FEATURES], df[TARGET])):
    # Rows of this fold; the label column is separated from the predictors below
    train_part = df.iloc[train_idx]
    valid_part = df.iloc[valid_idx]
    X_train, y_train = train_part.drop(TARGET, axis=1), train_part[TARGET]
    X_valid, y_valid = valid_part.drop(TARGET, axis=1), valid_part[TARGET]

    # A fresh classifier per fold, trained silently
    cbr = CatBoostClassifier(random_state=42).fit(X_train, y_train, verbose=False)
    y_pred = cbr.predict(X_valid)

    # Score once, then both store and report the value
    fold_f1 = f1_score(y_valid, y_pred)
    f1score.append(fold_f1)
    print(f'Fold {fold}: F1: ', fold_f1)

# Sub-sampling the majority class for balancing the data

In [None]:
# Balance the two classes by under-sampling the larger one down to the size of the smaller one.
SAMPLING_SEED = 42  # fixed seed so that the balanced subset is the same on every run

avail_df = df[df['yearly_availability'] == 1]
nonavail_df = df[df['yearly_availability'] == 0]

# Draw the rows at random instead of taking the first ones, so the subset does not depend
# on the order of the file, and cap both classes so it works whichever class is larger.
minority_len = min(len(avail_df), len(nonavail_df))
avail_df = avail_df.sample(n=minority_len, random_state=SAMPLING_SEED)
nonavail_df = nonavail_df.sample(n=minority_len, random_state=SAMPLING_SEED)
avail_len = len(avail_df)

balanced_df = pd.concat([avail_df, nonavail_df])

# Shuffling the data
balanced_df = balanced_df.sample(frac=1, random_state=SAMPLING_SEED)
balanced_df.head()

# Visualization using boxplot

In [None]:
# Outlier detection using seaborn boxplot: one panel per feature, split by class label
sns.set_theme(style="whitegrid")
fig, axes = plt.subplots(6, 5, figsize=(30, 35))
panels = axes.ravel()

# Plot every column except the label itself, limited to the number of panels available.
plot_columns = [c for c in balanced_df.columns if c != 'yearly_availability'][:len(panels)]

for ax, column in zip(panels, plot_columns):
    sns.boxplot(ax=ax, data=balanced_df, x='yearly_availability', y=column)
    ax.set_title(column)

# Hide the panels that received no plot so the grid has no empty frames.
for ax in panels[len(plot_columns):]:
    ax.set_visible(False)

plt.show()

# Feature Selection using K-Best

In [None]:
# Pairwise correlations of the balanced sub-sample
sub_sample_corr = balanced_df.corr()

# Single-panel figure holding the heatmap
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(15,10))
sns.heatmap(data=sub_sample_corr, ax=axes, cmap='coolwarm_r', annot_kws={'size':20})
axes.set_title(label='Sub-sample Correlation Matrix', fontsize=14)
plt.show()

In [None]:
# Separate the label from the predictors of the balanced sub-sample
y=balanced_df['yearly_availability']
X=balanced_df.drop(columns="yearly_availability")

# Hold out a quarter of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Shapes of the four parts, in the order train/test predictors, train/test labels
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
# Chi squared feature selection for categorical data
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# feature selection
def select_features(X_train, y_train, X_test):
    """
    Score features with the chi-squared statistic and transform both splits.

    The selector is fitted on the training split only and uses k='all'.

    Returns a tuple (transformed training data, transformed test data, fitted selector).
    """
    selector = SelectKBest(score_func=chi2, k='all')
    # Fit on the training data and transform it in the same step
    train_selected = selector.fit_transform(X_train, y_train)
    # The test data only goes through the already fitted selector
    test_selected = selector.transform(X_test)
    return train_selected, test_selected, selector

# feature selection
# Fit the selector on the training split and transform both splits
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

# Print the score of every feature, identified by its position
for i, score in enumerate(fs.scores_):
    print('Feature %d: %f' % (i, score))

# Bar chart of the same scores on a logarithmic y-axis
plt.bar(range(len(fs.scores_)), fs.scores_)
plt.yscale('log')
plt.show()

In [None]:
from sklearn import tree
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import r2_score

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Import it where it still exists; otherwise provide a function with the same name and
# call shape on top of ConfusionMatrixDisplay, so every cell that calls it keeps working.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Draw the confusion matrix of a fitted estimator on (X, y_true) and return the display object."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Baseline model: a single decision tree, seeded so that repeated runs give the same tree
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

print("Accuracy (in %):",clf.score(X_test, y_test)*100)

y_pred = clf.predict(X_test)
print('F1 score', f1_score(y_test, y_pred))

plot_confusion_matrix(clf, X_test, y_test)
# (precision, recall, F-score, support) for the positive class
print(precision_recall_fscore_support(y_test, y_pred, average='binary'))

plt.show()

In [None]:
# Feature importances of the current `clf`, drawn as one bar per feature position
importance = clf.feature_importances_
plt.bar(range(len(importance)), importance)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 300 trees. The seed is fixed so that the reported scores are
# reproducible from run to run (same seed value as the other seeded steps of this notebook).
clf = RandomForestClassifier(n_estimators=300, random_state=42)
clf = clf.fit(X_train, y_train)

print("Accuracy (in %):",clf.score(X_test, y_test)*100)

y_pred = clf.predict(X_test)
print('F1 score', f1_score(y_test, y_pred))

plot_confusion_matrix(clf, X_test, y_test)
# (precision, recall, F-score, support) for the positive class
print(precision_recall_fscore_support(y_test, y_pred, average='binary'))

plt.show()

In [None]:
# One bar per feature position, height = importance reported by the current `clf`
importance = clf.feature_importances_
feature_positions = np.arange(importance.shape[0])
plt.bar(feature_positions, importance)
plt.show()

In [None]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with shallow trees; hyper-parameters are gathered in one place
gb_params = dict(n_estimators=200, learning_rate=0.1, max_depth=2, random_state=42)
clf = GradientBoostingClassifier(**gb_params)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out split, reported as a percentage
accuracy = clf.score(X_test, y_test)
print("Accuracy (in %):", accuracy*100)

y_pred = clf.predict(X_test)
print('F1 score', f1_score(y_test, y_pred))

plot_confusion_matrix(clf, X_test, y_test)
# (precision, recall, F-score, support) for the positive class
print(precision_recall_fscore_support(y_test, y_pred, average='binary'))

plt.show()

In [None]:
# Importance of every feature according to the current `clf`, plotted against the feature position
importance = clf.feature_importances_
plt.bar(list(range(importance.size)), importance)
plt.show()

In [None]:
# Two-sigma outlier filter.
# NOTE(review): `data` is not defined in the visible cells of this notebook — bind it to the
# frame that should be filtered before running this cell.
mean_df = data.mean()
std_df = data.std()

# Keep the values inside (mean - 2*std, mean + 2*std]; everything else becomes NaN.
# The bounds are the same for the whole frame, so the mask is built once from the
# statistics computed above instead of being recomputed for every column.
df = data[(data > (mean_df - 2*std_df)) & (data <= (mean_df + 2*std_df))]

# Display the rows that contain no NaN. The result is only shown, not stored:
# `df` itself still holds the masked values.
df.dropna()