## tips_analysis.ipynb

In [None]:
import pandas as pd
import pandas_profiling

In [None]:
# Read the tips dataset from 'tips.csv' (the path is relative to the working directory).
tips = pd.read_csv('tips.csv')

In [None]:
# Preview the top of the frame to check that the file parsed as expected.
tips.head() # alternative: tips.head(10) to preview ten rows instead

In [None]:
# Preview the bottom of the frame as well.
tips.tail()

In [None]:
# List the column labels of the loaded frame.
tips.columns

In [None]:
# Print the structural summary of the frame (one line per column).
tips.info()

In [None]:
# Show summary statistics for the frame's columns.
tips.describe()

In [None]:
# NOTE(review): profile_report is not a core pandas method; presumably the
# `import pandas_profiling` in the first cell registers it on DataFrame — confirm.
tips.profile_report()

In [None]:
# Distinct values present in the 'day' column.
tips['day'].unique()

In [None]:
# Number of rows for each distinct 'day' value.
tips['day'].value_counts()

## restaurant-tip

### Getting Started

In [None]:
# Numerical / tabular tooling
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling: cross-validation splitter, feature scaler and the regressor.
# The scaler class is spelled `StandardScaler`; a misspelled name here makes the
# whole cell fail with ImportError and leaves the training loop without a scaler.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Use seaborn's 'darkgrid' style for every figure drawn below.
sns.set_style('darkgrid')

In [None]:
# Load the tips dataset from the working directory.
# A plain relative name is used on purpose: a backslash is only a path
# separator on Windows, so a '.\\'-prefixed name would be looked up as a literal
# file name on Linux/macOS and the read would fail there.
data = pd.read_csv('tips.csv')

In [None]:
# Display the loaded frame for a visual check.
data

In [None]:
# Print the structural summary of the loaded frame.
data.info()

### Preprocessing

In [None]:
def preprocess_inputs(df):
    """Return a copy of ``df`` with its categorical columns encoded as integers.

    The input frame is never modified. Four columns are rewritten:

    * ``sex``    - binary:  Female -> 0, Male -> 1
    * ``smoker`` - binary:  No -> 0, Yes -> 1
    * ``time``   - binary:  Lunch -> 0, Dinner -> 1
    * ``day``    - ordinal: Thur -> 0, Fri -> 1, Sat -> 2, Sun -> 3

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``sex``, ``smoker``, ``time`` and ``day``
        columns. All other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        The encoded copy.

    Raises
    ------
    KeyError
        If one of the four columns is absent from ``df``.
    """
    df = df.copy()

    def _encode(column, mapping):
        # Translate labels to codes and guarantee an integer dtype.
        # `replace` on a text column only yields integers through an implicit
        # downcast that pandas has deprecated, so the lookup is done with `map`.
        encoded = column.map(mapping)
        if encoded.isna().any():
            # A value is missing or not in the mapping: keep the previous
            # best-effort behaviour, which leaves such entries untouched.
            return column.replace(mapping)
        return encoded.astype('int64')

    encodings = {
        # Binary encoding
        'sex': {'Female': 0, 'Male': 1},
        'smoker': {'No': 0, 'Yes': 1},
        'time': {'Lunch': 0, 'Dinner': 1},
        # Ordinal encoding
        'day': {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3},
    }

    for name, mapping in encodings.items():
        df[name] = _encode(df[name], mapping)

    return df

In [None]:
# Encode the raw frame. preprocess_inputs works on a copy, so `data` keeps its original values.
X = preprocess_inputs(data)

In [None]:
# Display the encoded frame.
X

### Exploratory Data Analysis

In [None]:
# One histogram per column of X, laid out on a 2 x 4 grid of subplots.
plt.figure(figsize = (20, 10))

for i, column in enumerate(X.columns):
    plt.subplot(2, 4, i + 1)
    # Columns with more than two distinct values get a KDE curve on top of the
    # bars; two-valued columns are drawn as plain histograms.
    show_kde = len(X[column].unique()) > 2
    sns.histplot(X[column], kde = show_kde)
    if i == 0:
        # The title is attached to the first subplot only.
        plt.title('column Distributions')

plt.show()

In [None]:
# One box plot per column of X, laid out on a 2 x 4 grid of subplots.
# figsize must be a single (width, height) tuple.
plt.figure(figsize = (20, 10))

for i, column in enumerate(X.columns):
    plt.subplot(2, 4, i + 1)
    sns.boxplot(x = X[column])
    if i == 0:
        # The title is attached to the first subplot only.
        plt.title('Column Boxplots')

# Render explicitly, as the other plotting cells in this notebook do.
plt.show()

In [None]:
# Pair plot restricted to the 'total_bill' and 'tip' columns.
sns.pairplot(X.loc[:, ['total_bill', 'tip']])

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of X.
plt.figure(figsize = (12, 10))
sns.heatmap(
    data = X.corr(),
    vmin = -1.0,  # lower end of the colour scale
    cmap = 'mako',
    annot = True,  # write each coefficient inside its cell
)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Correlation matrix of X shown as a table of numbers.
X.corr()

### Training

In [None]:
# Shuffle all rows with a fixed seed (random_state = 1) and renumber the index from 0.
# Note: X is overwritten, so re-running this cell shuffles the already-shuffled frame again.
X = X.sample(frac = 1.0, random_state = 1).reset_index(drop = True)

In [None]:
# Target: the 'tip' column.
y = X['tip']
# Features: every column except 'tip'. Mind the case — lowercase `x` holds the
# features only, while uppercase `X` still contains 'tip'.
x = X.drop('tip', axis = 1)

In [None]:
# 5-fold cross-validation of a linear regression that predicts `tip`.
# `results` collects one R^2 score per fold.
results = []
kf = KFold(n_splits = 5)

# Folds are cut from the feature frame `x` (X without 'tip'). Slicing `X`
# instead would hand the target to the model as an input feature and make
# every score meaninglessly perfect.
for train_idx, test_idx in kf.split(x):

    x_train = x.iloc[train_idx, :]
    x_test = x.iloc[test_idx, :]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    # Fit the scaler on the training fold only, then apply the same transform
    # to both folds so that no test-fold statistics reach the model.
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = pd.DataFrame(scaler.transform(x_train), index = x_train.index, columns = x_train.columns)
    x_test = pd.DataFrame(scaler.transform(x_test), index = x_test.index, columns = x_test.columns)

    model = LinearRegression()
    model.fit(x_train, y_train)

    # Score of the fitted model on the held-out fold.
    results.append(model.score(x_test, y_test))

### Result

In [None]:
# Report each fold's score (folds numbered from 1), then the mean over all folds.
for fold_number, score in enumerate(results, start = 1):
    print(f"Fold {fold_number}: {score:.5f}")

print(f'\nAverage R^2 Score: {np.mean(results):.5f}')

In [None]:
# Raw list of the per-fold scores.
results

In [None]:
# y_test is whatever the last iteration of the training loop assigned,
# i.e. the test targets of the final fold; this is that fold's size.
len(y_test)

In [None]:
# Largest target value in y_test (the test split left over from the last fold run).
y_test.max()