# Statistical Analysis and Prediction of Store Sales
### Akshay Kumar
### Student no: 202095271
#### Due Date: 13 June 2022

## Importing the libraries

In [36]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

## Reading the data using pandas

In [56]:
class ReadData(object):
    """Load a CSV file into a pandas DataFrame."""

    def __init__(self, pathname):
        # Location of the CSV file, handed unchanged to ``pd.read_csv``.
        self.pathname = pathname

    def getDataFrame(self):
        """Read the CSV at ``self.pathname`` and return it as a DataFrame."""
        return pd.read_csv(self.pathname)
        
        



## Performing statistical operations using NumPy and SciPy

In [121]:
class Calculation(object):
    """Descriptive statistics and correlation helpers for a DataFrame."""

    def __init__(self, dataframe):
        self.df = dataframe
        # Statistics from the most recent ``getStatistics`` call, keyed by
        # their display name ('Mean', 'Median', ...).
        self.thisdict = {}

    def getStatistics(self, columnIndex):
        """Compute summary statistics for one column of ``self.df``.

        The mean, median, (population) standard deviation and 75th
        percentile of the selected column are printed, stored in
        ``self.thisdict`` and written to ``dict_file.csv`` in the current
        working directory.

        Args:
            columnIndex: Positional index of the column to summarise.

        Returns:
            ``self.thisdict`` with the keys 'Mean', 'Median',
            'Standard Deviation' and '75% Percentile'.
        """
        # Work on the frame owned by this instance (not a notebook global)
        # and extract only the requested column, so non-numeric columns
        # elsewhere in the frame cannot break the computation.
        column = self.df.iloc[:, columnIndex].to_numpy()

        mean = np.mean(column)
        self.thisdict['Mean'] = mean
        print("Mean:", mean)

        # np.median orders the values internally; no explicit sort is needed.
        median = np.median(column)
        self.thisdict['Median'] = median
        print('Median is:', median)

        # np.std defaults to ddof=0, i.e. the population standard deviation.
        st_dv = np.std(column)
        self.thisdict['Standard Deviation'] = st_dv
        print('Standard Deviation of is:', st_dv)

        percentile = np.percentile(column, q=75)
        self.thisdict['75% Percentile'] = percentile
        print('75% percentile of store sales:', percentile)

        # Persist the computed statistics, one "name,value" row per entry.
        pd.DataFrame.from_dict(data=self.thisdict, orient='index').to_csv(
            'dict_file.csv', header=False)

        return self.thisdict

    def getCorelation(self, firstColumnIndex, secondColumnIndex, dataframe):
        """Print and return the Pearson correlation between two columns.

        A coefficient of -1 or +1 implies an exact linear relationship
        between the two features.

        Args:
            firstColumnIndex: Positional index of the first column.
            secondColumnIndex: Positional index of the second column.
            dataframe: DataFrame holding both columns.

        Returns:
            The result of ``scipy.stats.pearsonr`` (coefficient, p-value).
        """
        # Select columns (features), not rows: ``arr[i]`` on the 2-D array
        # would pick the i-th observation instead of the i-th feature.
        first = dataframe.iloc[:, firstColumnIndex].to_numpy()
        second = dataframe.iloc[:, secondColumnIndex].to_numpy()
        r = stats.pearsonr(first, second)
        print(r)
        return r
        


## Creating some visualizations

In [109]:
class Visualizations(object):
    """Scatter and histogram plots drawn from the wrapped DataFrame."""

    def __init__(self, dataframe):
        self.df = dataframe

    def firstscatterplot(self):
        """Scatter 'Store_Area' against 'Daily_Customer_Count'.

        Both columns must be present in ``self.df``.
        """
        f, ax = plt.subplots(figsize=(6, 6))
        # Plot from the frame owned by this instance, not a notebook global.
        ax.scatter(x=self.df['Store_Area'], y=self.df['Daily_Customer_Count'],
                   alpha=0.5, c="g")
        ax.set_xlabel("Store Area")
        ax.set_ylabel("Customer count")
        ax.set_title('Does the store area affect customer count')
        f.show()

    def secondscatterplot(self, firstVariableName, secondVariableName):
        """Scatter one named column of ``self.df`` against another.

        Args:
            firstVariableName: Column plotted on the x-axis.
            secondVariableName: Column plotted on the y-axis.
        """
        f, ax = plt.subplots(figsize=(6, 6))
        ax.scatter(x=self.df[firstVariableName], y=self.df[secondVariableName],
                   alpha=0.5, c="m")
        ax.set_xlabel(firstVariableName)
        ax.set_ylabel(secondVariableName)
        ax.set_title('Does {} affect {}'.format(firstVariableName, secondVariableName))
        f.show()

    def histplot(self, columnName):
        """Draw a density histogram with a KDE curve and a marked mean.

        Args:
            columnName: Column of ``self.df`` whose distribution is plotted.
        """
        values = self.df[columnName]
        # Computed once; used for both the marker line and its annotation.
        mean = values.mean()

        plt.figure(figsize=(7, 7))
        plt.title("Distribution of {}".format(columnName))
        sns.histplot(values, stat='density', color='green')
        sns.kdeplot(values, color='black')
        plt.axvline(mean, color='red', linestyle='--', linewidth=0.8)
        min_ylim, max_ylim = plt.ylim()
        # Place the annotation just right of the mean line, near the top.
        plt.text(mean * 1.05, max_ylim * 0.96, 'Mean (μ): {:.2f}'.format(mean))
        # Label the axis with the plotted column rather than a fixed "Score".
        plt.xlabel(columnName)
        plt.ylabel("Density")
        plt.show()
        
        


## Using machine learning to predict store sales based on store area and items available

In [110]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error



# Regressors that ``Predict`` fits and queries; shared at module level.
models = [
    LinearRegression(),
    KNeighborsRegressor(),
]

class Predict():
    """Fit the module-level ``models`` on (x, y) and predict with them."""

    def __init__(self, x, y):
        # Feature matrix and target values used for training and evaluation.
        self.x = x
        self.y = y

    def trainAndTestModel(self):
        """Fit every model on an 80/20 split and score it on the held-out part.

        The data stored on this instance is split with a fixed random seed,
        each entry of ``models`` is fitted on the training part, and its
        predictions on the test part are scored.

        Returns:
            A dict mapping ``str(model)`` to ``{'MSE': ..., 'MAE': ...}``.
        """
        # Split the data held by this instance, not notebook-level globals.
        X_train, X_test, y_train, y_test = train_test_split(
            self.x, self.y, test_size=0.2, random_state=42)

        scores = {}
        for m in models:
            m.fit(X_train, y_train)
            y_pred = m.predict(X_test)
            scores[str(m)] = {
                'MSE': mean_squared_error(y_test, y_pred),
                'MAE': mean_absolute_error(y_test, y_pred),
            }
        return scores

    def predict(self, x_pred):
        """Print each model's predictions for ``x_pred``.

        The models must already have been fitted, e.g. through
        ``trainAndTestModel``.

        Args:
            x_pred: Feature rows to predict, in the layout used for fitting.
        """
        for m in models:
            predict_values = m.predict(x_pred)
            print('Store Sales predicted by {}: {}'.format(m, predict_values))
        
        


