# IMPORTING LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import time 
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,confusion_matrix
%matplotlib inline

# IMPORTING DATA 

In [None]:
# Read the BigMart sales CSV from a hard-coded absolute path into `df`.
# NOTE(review): the path looks machine-specific — adjust it before running elsewhere.
df = pd.read_csv('/root/Desktop/bigmart_sales.csv')

In [None]:
# Show the first rows of df as a quick sanity check of the load.
df.head()

# CHECKING NULL VALUES

In [None]:
# Per-column count of missing entries in the freshly loaded data.
df.isna().sum()

# TREATING THE MISSING VALUES

In [None]:
# Mean of the Item_Weight column, displayed as a reference for imputing its missing values.
df["Item_Weight"].mean()

In [None]:
# Impute missing item weights with the column mean, computed from the data
# rather than pasted in as a rounded literal, so the fill value stays correct
# if the dataset changes.
df["Item_Weight"] = df["Item_Weight"].fillna(df["Item_Weight"].mean())

In [None]:
# Missing-value counts again, to confirm Item_Weight no longer has gaps.
df.isna().sum()

In [None]:
# Impute missing outlet sizes with the most frequent category.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Outlet_Size'] operates on a selected Series, which pandas warns about
# and which leaves df untouched when Copy-on-Write is enabled.
df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Size'].mode().iloc[0])

In [None]:
# Final missing-value tally after both imputations.
df.isna().sum()

# CORRELATION BETWEEN THE COLUMNS

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
# The text columns have not been label-encoded yet at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict the
# frame to numeric dtypes explicitly (older pandas dropped them silently).
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)
plt.show()

# CHECKING DATA TYPES AND, IF DIFFERENT, CONVERTING THEM TO THE REQUIRED DATA TYPE

In [None]:
from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()

# Replace every text column with integer codes so the models can consume it.
# A column is treated as text when its first value is a str.
for col in df.columns:
    # .iloc[0] reads the first row by position; df[col][0] is a label lookup
    # and raises KeyError whenever the index has no label 0.
    if isinstance(df[col].iloc[0], str):
        # fit_transform refits the shared encoder on each column in turn.
        df[col] = encoder.fit_transform(df[col])

In [None]:
# List each column's dtype after the label-encoding step.
df.dtypes

In [None]:
# Store item weights as 64-bit integers (the cast discards the fractional part).
df['Item_Weight'] = df['Item_Weight'].astype('int64')

In [None]:
# Store the item MRP as 64-bit integers (the cast discards the fractional part).
df['Item_MRP'] = df['Item_MRP'].astype('int64')

In [None]:
# Store the sales target as 64-bit integers (the cast discards the fractional part).
df['Item_Outlet_Sales'] = df['Item_Outlet_Sales'].astype('int64')

In [None]:
# Re-check the dtypes after the integer casts above.
df.dtypes

# CHECKING OUTLIERS

In [None]:
# Row position vs. encoded fat-content value, to eyeball unusual points.
plt.scatter(x=df.index, y=df['Item_Fat_Content'])

In [None]:
# Row position vs. item MRP, to eyeball unusual points.
plt.scatter(x=df.index, y=df['Item_MRP'])

In [None]:
# Row position vs. outlet sales, before any outlier treatment.
plt.scatter(x=df.index, y=df['Item_Outlet_Sales'])

# TREATING OUTLIERS

In [None]:
# Index labels of the 30 rows with the highest sales (ascending sort, last 30 labels).
li = df['Item_Outlet_Sales'].sort_values().index[-30:].tolist()

In [None]:
# Overwrite the 30 largest sales values with the (integer-truncated) mean of
# the remaining rows.
# A single .loc write is used because df[col][li] = ... is chained assignment:
# it triggers SettingWithCopyWarning and does not modify df under Copy-on-Write.
df.loc[li, 'Item_Outlet_Sales'] = int(df.drop(li)['Item_Outlet_Sales'].mean())

In [None]:
# Same sales scatter again, now that the largest values have been replaced.
plt.scatter(x=df.index, y=df['Item_Outlet_Sales'])

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Target is the sales column; every other column is a predictor.
y = df['Item_Outlet_Sales']
X = df.drop(columns=['Item_Outlet_Sales'])

# Add the constant column to the predictors before computing the VIFs.
X = add_constant(X)

# One variance inflation factor per column of X, listed from smallest to largest.
pd.Series(
    [variance_inflation_factor(X.values, pos) for pos, _ in enumerate(X.columns)],
    index=X.columns,
).sort_values()

In [None]:
# (rows, columns) of the working frame.
df.shape

from sklearn.preprocessing import StandardScaler

# Fit a standard scaler on the predictor matrix and keep the transformed copy
# in `scaled_data`.
scalar = StandardScaler()
scaled_data = scalar.fit(X).transform(X)

import scipy.stats as stats
from scipy.stats import chi2_contingency

# Chi-square test of independence between each predictor and the target.
# Significance[col] holds (1 - p) * 100 for that column.
# An explicit float dtype avoids the empty-Series dtype deprecation warning and
# keeps the comparisons below numeric rather than object-typed.
Significance = pd.Series(dtype=float)

for i in X.columns:
    # Contingency table of predictor values against target values.
    dfObserved = pd.crosstab(X[i], y)
    chi2, p, dof, expected = stats.chi2_contingency(dfObserved.values)
    Significance[i] = (1 - p) * 100

Significance.sort_values()

Significance[Significance >= 99.9].index

# Keep only the predictors whose significance is at least 99.9.
X_ = X[Significance[Significance >= 99.9].index]

In [None]:
# Import here: the notebook's own LogisticRegression import sits in a later
# cell, so running the cells top to bottom raised NameError on this line.
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with default settings.
log = LogisticRegression()

In [None]:
# Hold out 33% of the rows for evaluating the logistic-regression model.
# random_state pins the shuffle so the scores are reproducible, matching the
# seeded splits used in the SVM and Naive Bayes sections.
# NOTE(review): X still holds every predictor plus the added constant; the
# chi-square-selected subset X_ is never used — confirm which was intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import AdaBoostClassifier

In [None]:
#log_boost = AdaBoostClassifier(base_estimator = log,n_estimators = 30)

In [None]:
#log_boost.fit(X_train,y_train)
#pred= log_boost.predict(X_test)

In [None]:
# Train the logistic-regression model on the training partition.
log.fit(X=X_train, y=y_train)

In [None]:
# Predictions for the held-out rows; the bare name echoes them in the notebook.
pred = log.predict(X=X_test)
pred

In [None]:
# Mean accuracy on the held-out rows. The true labels (y_test) must be passed:
# scoring against the model's own predictions always yields 1.0.
log.score(X_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Logistic-regression accuracy on the test partition, as a percentage.
100 * accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Confusion matrix of true vs. predicted labels for logistic regression.
confusion_matrix(y_true=y_test, y_pred=pred)

# SVM

In [None]:
from sklearn import svm
from sklearn.svm import SVC

In [None]:
# Rebuild predictors and target from df for the SVM section.
y = df['Item_Outlet_Sales']
X = df.drop(columns=['Item_Outlet_Sales'])

In [None]:
# 60/40 train/test partition with a fixed seed for the SVM section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [None]:
# Train a default-parameter support vector classifier, then predict the held-out rows.
model = SVC().fit(X_train, y_train)
pred = model.predict(X_test)

In [None]:
# Predict with the fitted SVC. `model_boost` is never defined anywhere in the
# notebook, so the original line raised NameError; `model` is the fitted SVM.
pred1 = model.predict(X_test)

In [None]:
# Confusion matrix of true vs. predicted labels for the SVM.
print(confusion_matrix(y_true=y_test, y_pred=pred1))

In [None]:
# Text classification report for the SVM predictions.
print(classification_report(y_true=y_test, y_pred=pred1))

In [None]:
# SVM accuracy on the test partition, as a percentage.
print("(in %):", 100 * accuracy_score(y_true=y_test, y_pred=pred1))

# GRID SEARCH

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the RBF-kernel SVC: five C values crossed with five gamma values.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

In [None]:
# Cross-validated search over param_grid; refit=True retrains the best
# configuration, and verbose=3 logs progress while fitting.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
)
grid.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Predict the held-out rows with the tuned search object and print its report.
grid_predictions = grid.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=grid_predictions))

# NAIVE BAYES

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [None]:
# Rebuild predictors and target from df for the Naive Bayes section.
y = df['Item_Outlet_Sales']
X = df.drop(columns=['Item_Outlet_Sales'])

In [None]:
# 60/40 train/test partition with a fixed seed for the Naive Bayes section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [None]:
# Gaussian Naive Bayes estimator with default settings.
gnb = GaussianNB() 

In [None]:
# Train the Naive Bayes model on the training partition.
gnb.fit(X=X_train, y=y_train)

In [None]:
# Naive Bayes predictions for the held-out rows; the bare name echoes them in the notebook.
pred2 = gnb.predict(X=X_test)
pred2

In [None]:
# Naive Bayes accuracy on the test partition, as a percentage.
print(
    "Gaussian Naive Bayes model accuracy(in %):",
    100 * metrics.accuracy_score(y_true=y_test, y_pred=pred2),
)