# In [None]:
# Importing libraries that we are going to use in this project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

# In [None]:
# Load the tips dataset from the CSV file into a DataFrame
df = pd.read_csv("tips.csv")
# Preview the first three records
df.head(n=3)

# Column labels present in the DataFrame
df.columns

# Per-column dtype and non-null count, to spot columns with an unexpected type
df.info()

# Number of missing values in each column
df.isna().sum()

# Summary statistics of the dataset (count, mean, min, ...)
df.describe()

# How often each day occurs
day_unique = df['day'].value_counts()
day_unique
# How often each time occurs
time_unique = df["time"].value_counts()
time_unique

# The same names are rebound here: from now on they hold the distinct
# values themselves rather than their frequencies
day_unique = df['day'].unique()
day_unique
time_unique = df["time"].unique()
time_unique

# Number of distinct values per column
df.nunique()

# Show the distinct values of every column, one column per printed line
for column_name in df.columns:
    print(df[column_name].unique())

# New DataFrame without the rows that contain at least one missing value
# (df itself is not modified by this call)
update_df = df.dropna(axis=0, how='any')
update_df

# Same clean-up, but applied to df itself (mutates it in place)
df.dropna(axis=0, how='any', inplace=True)
# Preview the first four records
df.head(n=4)

# Count how often each day occurs and turn the result into a DataFrame with
# explicit 'day' and 'count' columns.
# The axis and the values are named explicitly so the column labels do not
# depend on how the installed pandas version names the value_counts() result.
# (The former rename_axis(columns={...}) call only touched the *name* of the
# column axis, never the column labels, so it renamed nothing.)
value_counts = (
    df['day']
    .value_counts()
    .rename_axis('day')
    .reset_index(name='count')
)
value_counts

# Materialise the positional index as a regular 'index' column, giving the
# final layout: index, day, count
value_counts.reset_index(inplace=True)

value_counts

# Bar chart of the number of records per day, to show which day occurs most
plt.bar(value_counts['day'], value_counts['count'])

# Pie chart of the same counts, labelled by day.
# Only the first wedge is pulled out of the pie; the explode list is sized
# from the data so it always matches the number of wedges (plt.pie requires
# explode to have exactly one entry per wedge).
pie_explode = [0.2 if position == 0 else 0 for position in range(len(value_counts))]
plt.pie(
    value_counts['count'],
    labels=value_counts['day'],
    autopct='%1.2f',
    explode=pie_explode,
)

import seaborn as sns
# Distribution of total_bill: density-normalised histogram with a KDE curve.
# sns.distplot is deprecated; histplot with kde=True and stat='density' is
# the replacement that draws the same kind of figure.
sns.histplot(x=df['total_bill'], kde=True, stat='density')

# total_bill: one bar per record, positioned by the record's row number
total_bills = df['total_bill'].to_frame()
total_bills.reset_index(inplace=True)
bill_column = total_bills['total_bill']
plt.bar(bill_column.index, bill_column.values)

# total_bill: histogram
df['total_bill'].plot.hist()

# tip: histogram
df['tip'].plot.hist()

# Sex: number of records per value
df['sex'].value_counts().plot(kind='bar')

# Same counts drawn with seaborn. The column is passed via data=/x= keywords
# (as the other seaborn calls in this file do): a Series passed positionally
# is taken as `data`, not as the x variable, by current seaborn releases.
sns.countplot(data=df, x='sex')

# Smoker: number of records per value, with horizontal tick labels
df['smoker'].value_counts().plot(kind='bar', rot=0)

# Time: number of records per value
df['time'].value_counts().plot(kind='bar')

# Size: number of records per value
df['size'].value_counts().plot(kind='bar')

# Scatter plot of tip against total_bill
plt.scatter(x=df['total_bill'], y=df['tip'])
plt.xlabel('Total bill', fontsize=15)
plt.ylabel('Tips', fontsize=15)

# Average total_bill for each value of 'sex'
mean_bill_by_sex = df.groupby('sex')['total_bill'].mean()
mean_bill_by_sex.plot.bar()

# Average total_bill for each value of 'smoker'
mean_bill_by_smoker = df.groupby('smoker')['total_bill'].mean()
mean_bill_by_smoker.plot.bar()

# Average total_bill for each value of 'day'
mean_bill_by_day = df.groupby('day')['total_bill'].mean()
mean_bill_by_day.plot.bar()

# Average total_bill for each value of 'time'
mean_bill_by_time = df.groupby('time')['total_bill'].mean()
mean_bill_by_time.plot.bar()

# Number of records per day, split by time
sns.countplot(x='day', hue='time', data=df)

# total_bill vs tip, marker size by 'size', coloured by sex
sns.scatterplot(x='total_bill', y='tip', hue='sex', size='size', data=df)

# total_bill vs tip, marker size by 'size', coloured by smoker
sns.scatterplot(x='total_bill', y='tip', hue='smoker', size='size', data=df)

# total_bill vs tip, marker size by 'size', coloured by day
sns.scatterplot(x='total_bill', y='tip', hue='day', size='size', data=df)

# total_bill vs tip, marker size by 'size', coloured by time
sns.scatterplot(x='total_bill', y='tip', hue='time', size='size', data=df)

# Pairwise correlation between total_bill, tip and size
numeric_columns = ['total_bill', 'tip', 'size']
corr_matrix = df[numeric_columns].corr()
corr_matrix

# Heatmap of the correlation matrix with the coefficient written in each cell
sns.heatmap(data=corr_matrix, annot=True)

# Feature (total_bill) and target (tip) used by the train/test split below
x, y = df['total_bill'], df['tip']

# Relative frequency of each distinct total_bill / tip value
x.value_counts(normalize=True)
y.value_counts(normalize=True)

# Preview the first rows (pandas default row count)
df.head()

# Scatter plot of the feature against the target
sns.scatterplot(x='total_bill', y='tip', data=df)
plt.xlabel("Total Bill")
plt.ylabel("Tips")
plt.title("Total Bill vs Tip")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


def _as_feature_column(values, name):
    """Return *values* as a 2-D array with a single column.

    scikit-learn estimators expect X shaped (n_samples, n_features); a single
    feature held in a 1-D Series/array therefore has to become shape (n, 1).

    Args:
        values: pandas Series or numpy array holding the feature values.
        name: variable name used in the error message.

    Raises:
        TypeError: if *values* is neither a pandas Series nor a numpy array.
    """
    if isinstance(values, pd.Series):
        return values.to_numpy().reshape(-1, 1)
    if isinstance(values, np.ndarray):
        return values.reshape(-1, 1)
    raise TypeError(f"{name} must be a pandas Series or a numpy array")


# Split the dataset into training (80%) and test (20%) sets; the fixed
# random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Both feature sets must be 2-D. Previously only X_test was reshaped, so
# fit() received a 1-D X_train and was rejected by scikit-learn.
X_train = _as_feature_column(X_train, "X_train")
X_test = _as_feature_column(X_test, "X_test")

# Fit an ordinary least-squares line: tip ~ total_bill
model = LinearRegression()
model.fit(X_train, Y_train)

# Predicted tips for the held-out test set
y_pred = model.predict(X_test)

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
# Error metrics of the predictions against the held-out targets
mae = mean_absolute_error(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

# Report each metric on its own line
for metric_label, metric_value in (("MAE: ", mae), ("MSE: ", mse), ("R2: ", r2)):
    print(metric_label, metric_value)

# Predicted against actual tips for the test set
plt.scatter(x=Y_test, y=y_pred)
# Reference line y = x: points on it would be predicted exactly
plt.plot(Y_test, Y_test, label='Perfect Fit', color='r')
plt.title("Mean Squared Error: " + str(mse))
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.legend()

# Black points: every row of df (not only the test split).
# Red line: the model's predictions for the test-set total_bill values.
sns.scatterplot(x='total_bill', y='tip', color='black', data=df)
plt.plot(X_test, y_pred, linewidth=2, color='red')
plt.xlabel("Total_Bill")
plt.ylabel("Tips")
plt.title("Regression on test data")