<a href="https://colab.research.google.com/github/apoorvaec1030/Python-practice/blob/main/restaurant_problem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Restaurant problem

In [1]:
import pandas as pd
import numpy as np
from collections import Counter


# Seed NumPy's global generator so that "Restart Kernel -> Run All" reproduces
# the same dummy data (and therefore the same outputs) on every run.
np.random.seed(42)

# Dummy restaurant bills: 20 rows, billid drawn from 1-14 (so repeats are
# expected), custid from 100-104 and spend from 500-4999.
df=pd.DataFrame({'billid':np.random.randint(1,15,size=20),'custid':np.random.randint(100,105,size=20),'spend':np.random.randint(500,5000,size=20)})


In [None]:
#1. Identify Duplicate Bills-Write a function to find and return bill IDs that appear more than once in the dataset.
def find_duplicate_bills(bills):
    """Return the bill IDs that occur more than once in ``bills['billid']``.

    Parameters
    ----------
    bills : pandas.DataFrame
        Frame with a 'billid' column.

    Returns
    -------
    list
        Duplicated bill IDs, in order of first appearance (empty when every
        bill ID is unique).
    """
    # Count once and reuse the counts; Counter keeps insertion order.
    occurrences = Counter(bills['billid'])
    return [bill_id for bill_id, n_rows in occurrences.items() if n_rows > 1]


c = find_duplicate_bills(df)

# print() is needed here because only the last expression of a cell is rendered.
print(c)


#Follow-up: How would you handle duplicate billids if they represent incorrect data?
#exclude duplicates
# keep='last' retains the final row seen for each billid and drops the earlier
# ones. The result is a new frame that is only displayed; df itself is not changed.
df.drop_duplicates(subset='billid', keep='last')



In [None]:
#with unique billids
# randint draws with replacement and cannot guarantee uniqueness, so the ids
# are a shuffled 1..20 instead.
df=pd.DataFrame({'billid':np.random.permutation(np.arange(1,21)),'custid':np.random.randint(100,105,size=20),'spend':np.random.randint(500,5000,size=20)})


def customer_spend_stats(bills):
    """Summarise spending per customer.

    Parameters
    ----------
    bills : pandas.DataFrame
        Frame with 'custid' and 'spend' columns.

    Returns
    -------
    pandas.DataFrame
        One row per custid with columns 'custid', 'total', 'average' and
        'std'. 'std' is the pandas default (sample) standard deviation, so it
        is NaN for a customer that has a single bill.
    """
    per_customer = bills.groupby('custid')['spend'].agg(total='sum', average='mean', std='std')
    return per_customer.reset_index()


#2.Compute Customer-Level Spend Statistics-For each customer (custid), compute the total, average, and standard deviation of their spending.
gp_df=customer_spend_stats(df)
# Printed explicitly: only the last expression of a cell is rendered automatically.
print(np.round(gp_df))

#Follow-up: How would you handle cases where a customer has only one record (NaN std)?
#widen the custid range from 100-104 to 100-117 so that some customers end up with a single bill
df=pd.DataFrame({'billid':np.random.permutation(np.arange(1,21)),'custid':np.random.randint(100,118,size=20),'spend':np.random.randint(500,5000,size=20)})
gp_df=customer_spend_stats(df)
# A single bill has no spread to measure, so its NaN std is reported as 0.
np.round(gp_df.fillna({'std':0}))


In [None]:

#3.Find the Top 3 High-Spending Customers-Identify the top 3 customers who have spent the most in total.
def top_spenders(bills, n=3):
    """Return the ``n`` customers with the highest total spend.

    Parameters
    ----------
    bills : pandas.DataFrame
        Frame with 'custid' and 'spend' columns.
    n : int, default 3
        Number of customers to return.

    Returns
    -------
    pandas.Series
        Total spend indexed by custid, largest first.
    """
    # Rank on the per-customer sum: the task is about total spend, not the
    # largest single bill.
    return bills.groupby('custid')['spend'].sum().nlargest(n)


def top_percent_spenders(bills, pct=10):
    """Return the customers whose total spend lies in the top ``pct`` percent.

    Parameters
    ----------
    bills : pandas.DataFrame
        Frame with 'custid' and 'spend' columns.
    pct : float, default 10
        Size of the top band, in percent of customers.

    Returns
    -------
    pandas.Series
        Total spend indexed by custid, largest first, restricted to customers
        strictly above the (100 - pct)th percentile of customer totals.
    """
    totals = bills.groupby('custid')['spend'].sum()
    # The cutoff is taken over customer totals, not over individual bills.
    cutoff = np.percentile(totals, 100 - pct)
    return totals[totals > cutoff].sort_values(ascending=False)


# Printed explicitly: only the last expression of a cell is rendered automatically.
print(top_spenders(df))

#Follow-up: Modify the function to return customers who are in the top 10% spenders instead.
top_percent_spenders(df)


In [None]:
#4.Detect Anomalous Spending Behavior-Implement an outlier detection method (e.g., IQR or Z-score) to flag suspiciously high spending records.
#IQR fences are robust to extreme values - good for skewed data, less reliable on very small samples.
#Z-scores are sensitive to extreme values (the outlier inflates the mean and std it is measured against) - best for roughly normal data.
#With n points the largest possible |z| is (n-1)/sqrt(n), so a threshold of 3 can only fire when n >= 11.
from scipy import stats


def iqr_outliers(bills, k=1.5):
    """Return the rows of ``bills`` whose spend lies outside the IQR fences.

    Parameters
    ----------
    bills : pandas.DataFrame
        Frame with a 'spend' column.
    k : float, default 1.5
        Fence width as a multiple of the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        Rows with spend < Q1 - k*IQR or spend > Q3 + k*IQR (both tails).
    """
    q1, q3 = np.quantile(bills['spend'], [0.25, 0.75])
    spread = q3 - q1
    too_low = bills['spend'] < (q1 - k * spread)
    too_high = bills['spend'] > (q3 + k * spread)
    return bills[too_low | too_high]


def zscore_outliers(bills, threshold=3):
    """Return the rows of ``bills`` whose spend has |z-score| above ``threshold``.

    The scores are computed on the fly, so ``bills`` is not modified and the
    result has the same columns as the input.

    Parameters
    ----------
    bills : pandas.DataFrame
        Frame with a 'spend' column.
    threshold : float, default 3
        Absolute z-score above which a row is flagged (both tails).

    Returns
    -------
    pandas.DataFrame
        The flagged rows.
    """
    scores = np.abs(stats.zscore(bills['spend'].to_numpy(dtype=float)))
    return bills[scores > threshold]


# One shared sample for both methods, so that their results are comparable.
df=pd.DataFrame({'billid':np.random.randint(1,20,size=20),'custid':np.random.randint(100,105,size=20),'spend':np.random.randint(500,5000,size=20)})

# Printed explicitly: only the last expression of a cell is rendered automatically.
print(iqr_outliers(df))


# Follow-up: How would this change if we use Z-score instead of IQR?
#zscore method
zscore_outliers(df)


In [None]:
#include date in the dummy dataset
df=pd.DataFrame({'billid':np.random.randint(1,20,size=100)
                ,'custid':np.random.randint(100,153,size=100)
                ,'spend':np.random.randint(500,5000,size=100)
                ,'date':pd.to_datetime(np.random.randint(19959,20959,size=100),unit='D')})

#5.Predict Next Bill Amount Using a Simple Model-Given a customer’s past spending, predict their next bill amount using a basic regression model.

#1.sort transactions by customer, then date
#2.create customer spend features
#3.split X&Y data
#4.call model, fit, predict
#5.evaluate MAE, R2, adjR2
#6.predict on X

#step1
df=df.sort_values(by=['custid','date']).reset_index(drop=True)


#step2 feature creation
def build_customer_features(bills, as_of=None):
    """Build one feature row per customer for predicting their latest bill.

    The most recent bill of each customer is the prediction target
    ('last_spend_amount'). The spend features are computed from the bills
    before it, so the target value never leaks into its own predictors.
    Customers with a single bill have no history and are left out.

    Parameters
    ----------
    bills : pandas.DataFrame
        Frame with 'billid', 'custid', 'spend' and 'date' (datetime) columns.
    as_of : pandas.Timestamp, optional
        Reference date for 'days_since_last_tranx'. Defaults to the latest
        date in ``bills``, which keeps the feature non-negative and the
        result independent of when the code is run.

    Returns
    -------
    pandas.DataFrame
        Columns: 'custid', 'first_tranx', 'last_tranx', 'total_spend',
        'avg_spend', 'last_spend_amount', 'num_tranx',
        'days_since_last_tranx'. 'first_tranx', 'total_spend', 'avg_spend'
        and 'num_tranx' describe the bills before the latest one.
    """
    ordered = bills.sort_values(by=['custid', 'date'])
    if as_of is None:
        as_of = ordered['date'].max()

    # After sorting, the last row of every custid is that customer's latest bill.
    is_latest = ~ordered.duplicated(subset='custid', keep='last')
    latest = (ordered[is_latest]
              .set_index('custid')[['date', 'spend']]
              .rename(columns={'date': 'last_tranx', 'spend': 'last_spend_amount'}))

    history = ordered[~is_latest].groupby('custid').agg(
        first_tranx=('date', 'first'),
        total_spend=('spend', 'sum'),
        avg_spend=('spend', 'mean'),
        num_tranx=('billid', 'count'))

    # Inner join: only customers that have both a history and a latest bill.
    features = history.join(latest, how='inner').reset_index()
    features['days_since_last_tranx'] = (as_of - features['last_tranx']).dt.days
    return features[['custid', 'first_tranx', 'last_tranx', 'total_spend', 'avg_spend',
                     'last_spend_amount', 'num_tranx', 'days_since_last_tranx']]


feature_df=build_customer_features(df)


#step3
# Predictors and target: the regression estimates each customer's latest bill
# amount from their aggregate spend features.
X=feature_df[['total_spend','avg_spend','num_tranx','days_since_last_tranx']]
y=feature_df['last_spend_amount']

from sklearn.model_selection import train_test_split

# Fixed random_state keeps the 80/20 split identical between runs.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

#step4

from sklearn.linear_model import LinearRegression

model=LinearRegression()
model.fit(X_train,y_train)

y_pred=model.predict(X_test)

#step5

from sklearn.metrics import mean_absolute_error,r2_score

mae=mean_absolute_error(y_test,y_pred)
# Stored as `r2` so that the imported r2_score function is not overwritten by
# its own result.
r2=r2_score(y_test,y_pred)

# Adjusted R2 penalises R2 for the number of predictors; it is undefined when
# the test set has no more rows than predictors + 1.
n_obs,n_predictors=X_test.shape
dof=n_obs-n_predictors-1
adj_r2=1-(1-r2)*(n_obs-1)/dof if dof>0 else float('nan')

# Printed explicitly: only the last expression of a cell is rendered automatically.
print(f'MAE: {mae:.2f} | R2: {r2:.3f} | adjusted R2: {adj_r2:.3f}')

#step6
#predict
# Scores every customer, including those the model was trained on.
feature_df['predicted_spend']=model.predict(X)
np.round(feature_df)
# Follow-up: How would you extend this model using RNNs (Recurrent Neural Networks) for more accurate forecasting?