In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
train = pd.read_csv("../Big_Mart_Sales/Train.csv")
train.head()

### Total Columns and Rows in Dataset

In [None]:
train.shape

### Data Visualization

In [None]:
# Number of rows per item category.
# The Series is bound to `x` by keyword: seaborn >= 0.12 interprets the first
# positional argument as `data`, which would change what is drawn.
sb.countplot(x=train.Item_Type, palette='winter')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Number of rows per outlet size (missing sizes are not counted).
# Passed as `x=` because seaborn >= 0.12 treats the first positional argument as `data`.
sb.countplot(x=train.Outlet_Size, palette='winter')
plt.show()

In [None]:
# Number of rows per outlet type.
# Passed as `x=` because seaborn >= 0.12 treats the first positional argument as `data`.
sb.countplot(x=train.Outlet_Type, palette='winter')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Number of rows per outlet establishment year.
# Passed as `x=` because seaborn >= 0.12 treats the first positional argument as `data`.
sb.countplot(x=train.Outlet_Establishment_Year, palette='winter')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Number of rows per outlet.
# Passed as `x=` because seaborn >= 0.12 treats the first positional argument as `data`.
sb.countplot(x=train.Outlet_Identifier, palette='winter')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Horizontal bars: aggregated Item_Outlet_Sales for each outlet.
sb.barplot(data=train, x='Item_Outlet_Sales', y='Outlet_Identifier', palette='prism')
plt.show()

In [None]:
# Horizontal bars: aggregated Item_Outlet_Sales for each item category,
# drawn on an explicitly created 8x8 inch figure.
fig, ax = plt.subplots(figsize=(8,8))
sb.barplot(data=train, x='Item_Outlet_Sales', y='Item_Type', palette='prism', ax=ax)
plt.show()

In [None]:
# Horizontal bars: aggregated Item_Outlet_Sales for each outlet location tier.
sb.barplot(data=train, x='Item_Outlet_Sales', y='Outlet_Location_Type', palette='prism')
plt.show()

In [None]:
# Horizontal bars: aggregated Item_Outlet_Sales for each outlet size.
sb.barplot(data=train, x='Item_Outlet_Sales', y='Outlet_Size', palette='prism')
plt.show()

In [None]:
# Same view as the outlet-size plot, split further by outlet location tier (hue).
sb.barplot(
    data=train,
    x='Item_Outlet_Sales',
    y='Outlet_Size',
    hue='Outlet_Location_Type',
    palette='prism',
)
plt.show()

### Statistical Description

In [None]:
train.describe()

### Item visibility cannot legitimately have a minimum value of 0
### So, treat 0 as missing information

### Check NaN (missing values) values

In [None]:
train.isnull().sum()

### Check missing values represented by other symbols if any

In [None]:
train.dtypes

In [None]:
train.describe(include=['object'])

In [None]:
cat_features = train.describe(include=['object']).columns.tolist()
cat_features

### Check unique values for these cat_features: 

In [None]:
for i in cat_features:
    print("Column:",i)
    print(train[i].unique())
    print("")

### Unique values for Item Identifier

In [None]:
train['Item_Identifier'].nunique()

In [None]:
for i in train['Item_Identifier'].unique():
    print(i, end=" ")

### In Item Fat Content, "Low Fat", "low fat" and "LF" all mean Low Fat, and "Regular" and "reg" both mean Regular. So we will normalize these labels during preprocessing

In [None]:
# Map every alternative spelling onto its canonical label in one pass.
fat_content_aliases = {"low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular"}
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
train.Item_Fat_Content.unique()

### Missing values handling:

### For Item Weight, we will check Mean value of Item Weight for all items categorized by Item Type and also will check Mean value of Item Weight for all items categorized by Item Identifier

In [None]:
train.Item_Type.unique()

In [None]:
train[train['Item_Type'] == 'Dairy']['Item_Weight'].mean(skipna=True)

In [None]:
# Report the mean Item_Weight (missing weights ignored) for each item category.
for j in train.Item_Type.unique():
    weights_of_type = train.loc[train['Item_Type'] == j, 'Item_Weight']
    avg = weights_of_type.mean(skipna = True)
    print("Average Item Weight of all Items for Item Type %s is %f" %(j, avg))

In [None]:
# Report the mean Item_Weight (missing weights ignored) for each item identifier.
# A single grouped pass replaces one full-frame scan per identifier;
# sort=False keeps the identifiers in order of first appearance, as unique() did.
for j, weights in train.groupby('Item_Identifier', sort=False)['Item_Weight']:
    avg = weights.mean(skipna = True)
    print("Average Item Weight of all Items for Item Identifier %s is %f" %(j, avg))

### We will use the second approach: wherever Item Weight is missing, we replace it with the mean Item Weight of the same Item Identifier

In [None]:
# Fill each missing Item_Weight with the mean weight (rounded to 2 decimals) of
# the rows sharing its Item_Identifier. The filled values and their row labels
# are collected in parallel lists, identifier by identifier.
# One grouped pass replaces two full-frame scans per identifier; sort=False
# keeps the identifiers in order of first appearance, as unique() did.
iwvalues = []; iwindex = []

for j, weights in train.groupby('Item_Identifier', sort=False)['Item_Weight']:

    # NaN when every weight of this identifier is missing; those rows stay NaN.
    avg = weights.mean(skipna = True)

    result = weights.fillna(round(avg, 2))

    iwvalues.extend(result.values)
    iwindex.extend(result.index)

In [None]:
result = pd.Series(iwvalues, index=iwindex)
result.head(10)

In [None]:
result.sort_index(inplace=True)

In [None]:
result.head(10)

### Checking index positions for columns

In [None]:
for i in enumerate(train.columns):
    print(i)

### Drop Original Item Weight Column and Place new same named column containing non null values

In [None]:
train.drop('Item_Weight', axis=1, inplace=True)

In [None]:
train.insert(1, 'Item_Weight', result.values)

### Check Missing Info for Item Weight

In [None]:
train.isnull().sum()

### Let's find out which four observations still have a missing Item Weight

In [None]:
f = train['Item_Weight'].isnull()
f.head()

In [None]:
# Print every row whose Item_Weight is still missing after imputation.
j = train['Item_Weight'].isnull()
for i in j.index[j.values]:
    print("Row Label:",i)
    print(train.loc[i])
    print("")

### Drop these 4 Rows

In [None]:
# Remove, in place, all rows whose Item_Weight is still missing (one drop call
# with every affected label instead of one call per row).
j = train['Item_Weight'].isnull()
train.drop(labels = j.index[j.values], axis=0, inplace=True)

In [None]:
train.isnull().sum()

### Missing values handling for Item Visibility same as we have done for Item Weight

In [None]:
train[train['Item_Visibility'] == 0]

In [None]:
# Mark zero visibility as missing.
# The result is assigned back to the column: replace(inplace=True) on a column
# selection is a chained update that pandas Copy-on-Write does not write back
# to `train`. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
train['Item_Visibility'] = train['Item_Visibility'].replace(0.0, np.nan)

In [None]:
train.isnull().sum()

In [None]:
for j in train.Item_Type.unique():
    avg = train[train['Item_Type'] == j]['Item_Visibility'].mean(skipna = True)
    print("Average Item Visibility of all Items for Item Type %s is %f" %(j, avg))

In [None]:
for j in train.Item_Identifier.unique():
    avg = train[train['Item_Identifier'] == j]['Item_Visibility'].mean(skipna = True)
    print("Average Item Visibility of all Items for Item Identifier %s is %f" %(j, avg))

In [None]:
# Fill each missing Item_Visibility with the mean visibility of the rows sharing
# its Item_Identifier. The filled values and their row labels are collected in
# parallel lists, identifier by identifier.
# One grouped pass replaces two full-frame scans per identifier; sort=False
# keeps the identifiers in order of first appearance, as unique() did.
iwvalues = []; iwindex = []

for j, visibility in train.groupby('Item_Identifier', sort=False)['Item_Visibility']:

    # NaN when every visibility of this identifier is missing; those rows stay NaN.
    avg = visibility.mean(skipna = True)

    result = visibility.fillna(avg)

    iwvalues.extend(result.values)
    iwindex.extend(result.index)

In [None]:
result = pd.Series(iwvalues, index=iwindex)
result.head()

In [None]:
result.sort_index(inplace=True)

In [None]:
result.head()

In [None]:
train.drop('Item_Visibility', axis=1, inplace=True)

In [None]:
train.insert(3, 'Item_Visibility', result.values)

In [None]:
train.isnull().sum()

### Missing values handling for Outlet Size

### Check Unique values of Outlet Identifier, and find out Outlet Size for each Outlet Identifier

In [None]:
train.Outlet_Identifier.unique()

In [None]:
train.groupby('Outlet_Identifier')['Outlet_Size'].value_counts()

### Find info for each unique Outlet identified by its Outlet Identifier including Outlet Size, Outlet Location Type and Outlet Type

In [None]:
# For every outlet, print the size, location tier and type found on the FIRST
# row of that outlet (a missing size therefore prints as nan).
for i in train['Outlet_Identifier'].unique():
    outlet_rows = train[train['Outlet_Identifier'] == i]
    outlet_size = outlet_rows['Outlet_Size'].values[0]
    location_type = outlet_rows['Outlet_Location_Type'].values[0]
    outlet_type = outlet_rows['Outlet_Type'].values[0]
    print("Outlet_Identifier:", i, "| Outlet_Size:", outlet_size)
    print("Outlet_Location_Type:", location_type, "| Outlet_Type:", outlet_type)
    print("")

### After analysis, we concluded that OUT010, OUT017 and OUT045 should all have Outlet Size Small

### Make these replacements

In [None]:
r1 = train[train['Outlet_Identifier'] == 'OUT010']['Outlet_Size'].fillna("Small")

In [None]:
r2 = train[train['Outlet_Identifier'] == 'OUT017']['Outlet_Size'].fillna("Small")

In [None]:
r3 = train[train['Outlet_Identifier'] == 'OUT045']['Outlet_Size'].fillna("Small")

In [None]:
result = pd.concat([r1, r2, r3], axis=0)

In [None]:
result.head()

In [None]:
result.sort_index(inplace=True)

In [None]:
result.head()

### Make a series which contain Non Null values of Outlet Size

In [None]:
osindex = []; osvalues = []
for i, j in train['Outlet_Size'].isnull().items():
    if j == False:
        osindex.append(i)
        osvalues.append(train['Outlet_Size'][i])

In [None]:
remaining_items = pd.Series(osvalues, index=osindex)
remaining_items.head()

### Concat both these series

In [None]:
result = pd.concat([result, remaining_items], axis=0)

In [None]:
result.sort_index(inplace=True)

In [None]:
result.head()

### Drop original Outlet Size column which contain Null Values and Replace the new column named as same which will contain Non Null Values

In [None]:
train.drop('Outlet_Size', axis=1, inplace=True)

In [None]:
train.insert(8, 'Outlet_Size', result)

In [None]:
train.isnull().sum()

### Check Outliers presence in Numeric Columns

In [None]:
train.dtypes

In [None]:
numeric_cols = [i for i in train.columns.tolist() if train[i].dtype == float]

In [None]:
numeric_cols

In [None]:
# One box plot per float column to eyeball outliers.
# The Series is bound to `x` by keyword: seaborn >= 0.12 interprets the first
# positional argument as `data`, which would change what is drawn.
for i in numeric_cols:
    print("Column:", i)
    sb.boxplot(x=train[i])
    plt.show()
    print("")

In [None]:
trainnew = train.copy()

### Remove Outliers

In [None]:
def outlier_check(df=None, columns=('Item_Visibility', 'Item_Outlet_Sales'), whisker=1.5):
    """Drop, in place, every row that is an IQR outlier in any of `columns`.

    A value is an outlier when it lies below Q1 - whisker*IQR or above
    Q3 + whisker*IQR of its column. Rows flagged in several columns are
    dropped once. Progress counts are printed.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to clean in place. Defaults to the notebook-level `trainnew`,
        which is what the original zero-argument call operated on.
    columns : iterable of str
        Numeric columns to screen.
    whisker : float
        Multiplier applied to the IQR to build the fences.

    Returns
    -------
    None
    """
    if df is None:
        df = trainnew
    outliers = set()

    for i in columns:
        # Series.quantile ignores NaN. np.percentile returns NaN as soon as the
        # column holds a single NaN, which makes both comparisons below False
        # and silently reports zero outliers.
        q1 = df[i].quantile(0.25)
        q3 = df[i].quantile(0.75)
        iqr = q3 - q1
        c1 = q1 - (whisker*iqr)
        c2 = q3 + (whisker*iqr)
        result = df[(df[i] > c2) | (df[i] < c1)]
        print("Outliers present in column %s are %d" %(i, result.shape[0]))
        outliers.update(result.index.tolist())

    print("Total Outliers to be Remove:", len(outliers))
    df.drop(labels=list(outliers), axis=0, inplace=True)
    print("Total Rows Remaining in Dataset:", df.shape[0])

In [None]:
outlier_check()

In [None]:
trainnew.shape

In [None]:
trainnew.head()

### Create a new column which contains age of the outlet

In [None]:
import datetime

In [None]:
today = datetime.date.today()

In [None]:
today

In [None]:
today.year

In [None]:
# Outlet age in years, relative to the year in which the notebook is executed.
trainnew['Age_of_Outlet'] = today.year - trainnew['Outlet_Establishment_Year']

In [None]:
trainnew.head()

### Drop unnecessary columns Item Identifier, Outlet Identifier, Outlet Establishment Year

In [None]:
trainnew.drop(['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'], axis=1, inplace=True)

In [None]:
trainnew.head()

In [2]:
r2scores = []; RMAE = []; algos = []

### Prepare data for Linear Regression:

In [None]:
trainnewlr = trainnew.copy()

### Separate features and target

In [None]:
X = trainnewlr.drop("Item_Outlet_Sales", axis = 1)
y = trainnewlr['Item_Outlet_Sales']

### Applying standard scaling

In [None]:
X.head()

In [None]:
X_scaling =  ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Age_of_Outlet']

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scalar = StandardScaler()

In [None]:
for i in X_scaling:
    X[i] = scalar.fit_transform(X[i].values.reshape(-1,1))

In [None]:
X.head()

In [None]:
y = scalar.fit_transform(y.values.reshape(-1,1))

In [None]:
y

### Dummy variable creation

In [None]:
cat_features = X.describe(include=['object']).columns.tolist()

In [None]:
cat_features

In [None]:
pd.get_dummies(X[cat_features]).head()

In [None]:
X = pd.get_dummies(X, columns = cat_features, drop_first = True)

In [None]:
X.head()

### Check relationship between IV and DV

In [None]:
df_train = pd.read_csv("../Big_Mart_Sales/Train.csv")
df_train.head()

In [None]:
# Pairwise correlation of the numeric columns only. The raw frame also holds
# string columns, on which DataFrame.corr() raises in pandas >= 2.0.
df_train.select_dtypes(include=[np.number]).corr()

### Train Test Split for Linear Regression

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 hold-out split. The seed is fixed so the reported scores are
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Making predictive model using Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model_lr = LinearRegression()

In [None]:
model_lr.fit(x_train, y_train)

In [None]:
y_predict = model_lr.predict(x_test)

### Evaluate the Model

In [4]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
print("R2 Score:", r2_score(y_test, y_predict))

In [None]:
print("MSE:", mean_squared_error(y_test, y_predict))

In [None]:
print("MAE", mean_absolute_error(y_test, y_predict))

In [None]:
print("RMAE:", np.sqrt(mean_absolute_error(y_test, y_predict)))

In [None]:
# Record this model's scores for the final comparison table.
# "RMAE" as computed here is the square root of the mean absolute error (it is not RMSE).
# NOTE(review): y was standardized earlier in this section, so this error is in
# standardized units — confirm it is comparable with models fitted on raw sales.
r2scores.append(r2_score(y_test, y_predict))
RMAE.append(np.sqrt(mean_absolute_error(y_test, y_predict)))
algos.append("Linear Regression")

### Prepare data for Non Linear Regression Algos

In [None]:
trainnewnl = trainnew.copy()

### Separate features and target

In [None]:
X = trainnewnl.drop("Item_Outlet_Sales", axis = 1)
y = trainnewnl['Item_Outlet_Sales']

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
label = LabelEncoder()

In [None]:
cat_features = X.describe(include=['object']).columns.tolist()

In [None]:
cat_features

In [None]:
for i in cat_features:
    X[i] = label.fit_transform(X[i])

### One Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
for i in enumerate(X.columns):
    print(i)

In [None]:
# One-hot encode the columns at positions 3, 5, 6 and 7 and pass every other
# column through unchanged, appended to the right of the encoded block.
# OneHotEncoder(categorical_features=...) was removed in scikit-learn 0.22;
# ColumnTransformer is its replacement and produces the same column layout.
# sparse_threshold=1.0 keeps the output sparse so a later .toarray() still applies.
from sklearn.compose import ColumnTransformer

ohc = ColumnTransformer(
    [("onehot", OneHotEncoder(categories='auto'), [3, 5, 6, 7])],
    remainder="passthrough",
    sparse_threshold=1.0,
)

In [None]:
X = ohc.fit_transform(X)

In [None]:
X

In [None]:
X = X.toarray()

In [None]:
X

In [None]:
np.set_printoptions(suppress=True)

In [None]:
X

### Prepare data for Support Vector Regressor

In [None]:
X_svr = X.copy()

In [None]:
y_svr = y.copy()

### Applying Standard Scaling

In [None]:
X_svr[0]

In [None]:
X_svr[:, -1]

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scalar = StandardScaler()

In [None]:
X_svr[:, -1] = scalar.fit_transform(X_svr[:, -1].reshape(-1,1)).flatten()

In [None]:
# Standardize, one at a time, the columns located 2, 3 and 5 positions from the
# right edge of X_svr; the scaler is refitted for each column.
for col_offset in (-2, -3, -5):
    column = X_svr[:, col_offset].reshape(-1,1)
    X_svr[:, col_offset] = scalar.fit_transform(column).flatten()

In [None]:
X_svr[0]

In [None]:
y_svr = scalar.fit_transform(y_svr.values.reshape(-1,1))

In [None]:
y_svr

### Train Test Split for SVR

In [None]:
# 80/20 hold-out split for the SVR inputs. The seed is fixed so the reported
# scores are reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X_svr, y_svr, test_size = 0.2, random_state = 42)

### Making predictive model using Support Vector Regressor

In [None]:
from sklearn.svm import SVR

In [None]:
model_svr = SVR()

In [None]:
# y_train is a column vector (it comes from a reshape(-1, 1) scaling step);
# SVR.fit expects a 1-D target, so flatten it rather than rely on an implicit
# conversion that only emits a (suppressed) warning.
model_svr.fit(x_train, y_train.ravel())

In [None]:
y_predict = model_svr.predict(x_test)

### Evaluate the Model

In [None]:
print("R2 Score:", r2_score(y_test, y_predict))

In [None]:
print("MSE:", mean_squared_error(y_test, y_predict))

In [None]:
print("MAE", mean_absolute_error(y_test, y_predict))

In [None]:
print("RMAE:", np.sqrt(mean_absolute_error(y_test, y_predict)))

In [None]:
r2scores.append(r2_score(y_test, y_predict))
RMAE.append(np.sqrt(mean_absolute_error(y_test, y_predict)))
algos.append("Support Vector")

### Prepare Data for Decision Tree, Random Forest and XGBoost Regressor
### Train Test Split

In [None]:
# 80/20 hold-out split shared by the Decision Tree, Random Forest and XGBoost
# models. The seed is fixed so the reported scores are reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Making predictive model using Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
model_dt = DecisionTreeRegressor()

In [None]:
model_dt.fit(x_train, y_train)

In [None]:
y_predict = model_dt.predict(x_test)

### Evaluate the Model

In [None]:
print("R2 Score:", r2_score(y_test, y_predict))

In [None]:
print("MSE:", mean_squared_error(y_test, y_predict))

In [None]:
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_predict)))

In [None]:
print("MAE", mean_absolute_error(y_test, y_predict))

In [None]:
print("RMAE:", np.sqrt(mean_absolute_error(y_test, y_predict)))

In [None]:
r2scores.append(r2_score(y_test, y_predict))
RMAE.append(np.sqrt(mean_absolute_error(y_test, y_predict)))
algos.append("Decision Tree")

### Making predictive model using Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
model_rf = RandomForestRegressor(n_estimators = 100)

In [None]:
model_rf.fit(x_train, y_train)

In [None]:
y_predict = model_rf.predict(x_test)

### Evaluate the Model

In [None]:
print("R2 Score:", r2_score(y_test, y_predict))

In [None]:
print("MSE:", mean_squared_error(y_test, y_predict))

In [None]:
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_predict)))

In [None]:
print("MAE", mean_absolute_error(y_test, y_predict))

In [None]:
print("RMAE:", np.sqrt(mean_absolute_error(y_test, y_predict)))

In [None]:
r2scores.append(r2_score(y_test, y_predict))
RMAE.append(np.sqrt(mean_absolute_error(y_test, y_predict)))
algos.append("Random Forest")

### Making predictive model using XgBoost Regressor

In [None]:
import xgboost as xg

In [None]:
model_xg = xg.XGBRegressor()

In [None]:
model_xg.fit(x_train, y_train)

In [None]:
y_predict = model_xg.predict(x_test)

### Evaluate the Model

In [None]:
print("R2 Score:", r2_score(y_test, y_predict))

In [None]:
print("MSE:", mean_squared_error(y_test, y_predict))

In [None]:
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_predict)))

In [None]:
print("MAE", mean_absolute_error(y_test, y_predict))

In [None]:
print("RMAE:", np.sqrt(mean_absolute_error(y_test, y_predict)))

In [None]:
r2scores.append(r2_score(y_test, y_predict))
RMAE.append(np.sqrt(mean_absolute_error(y_test, y_predict)))
algos.append("XGBoost")

### Create a Model Report

In [None]:
report = pd.DataFrame({"Algos": algos, "R2 Score": r2scores, "Root_Mean_Absolute_Error": RMAE})
report

### We will select XGBoost as the final model because it gives the highest R2 score

### Apply all of the above data preprocessing steps to the Test.csv dataset

In [None]:
test = pd.read_csv("../Big_Mart_Sales/Test.csv")
test.head()

In [None]:
test.shape

In [None]:
test.isnull().sum()

In [None]:
test.describe()

In [None]:
cat_features = test.describe(include=['object']).columns.tolist()
cat_features

In [None]:
for i in cat_features:
    print("Column:", i)
    print("Unique Values:", test[i].unique())
    print("")

### Replace LF and low fat with Low Fat, and reg with Regular

In [None]:
# Map every alternative spelling onto its canonical label.
# The result is assigned back to the column: replace(inplace=True) on a column
# selection is a chained update that pandas Copy-on-Write does not write back
# to `test`.
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace(
    {"LF": 'Low Fat', "low fat": 'Low Fat', "reg": 'Regular'}
)

In [None]:
test.Item_Fat_Content.unique()

### Missing values handling for Item Weight 

In [None]:
# Fill each missing Item_Weight with the mean weight (rounded to 2 decimals) of
# the test rows sharing its Item_Identifier. The filled values and their row
# labels are collected in parallel lists, identifier by identifier.
# One grouped pass replaces two full-frame scans per identifier; sort=False
# keeps the identifiers in order of first appearance, as unique() did.
iwvalues = []; iwindex = []

for j, weights in test.groupby('Item_Identifier', sort=False)['Item_Weight']:

    # NaN when every weight of this identifier is missing; those rows stay NaN.
    avg = weights.mean(skipna = True)

    result = weights.fillna(round(avg, 2))

    iwvalues.extend(result.values)
    iwindex.extend(result.index)

In [None]:
result = pd.Series(iwvalues, index = iwindex)
result.sort_index(inplace = True)
result.head()

In [None]:
for i in enumerate(test.columns):
    print(i)

In [None]:
test.drop('Item_Weight', axis=1, inplace=True)

In [None]:
test.insert(1, 'Item_Weight', result)

In [None]:
test.isnull().sum()

In [None]:
j = test['Item_Weight'].isnull()
rows = []
for i,k in zip(j.index, j.values):
    if k == True:
        print("Row Label:",i)
        print(test.loc[i])
        rows.append(i)
        print("")

In [None]:
print(rows)

In [None]:
# Fallback for rows whose identifier had no known weight: use the mean
# Item_Weight of the row's Item_Type.
# Only the Item_Weight cell is written. Filling the whole row
# (test.loc[i].fillna(avg)) would also overwrite any other missing field of
# that row, e.g. a still-missing Outlet_Size, with the weight average.
for i in rows:
    it = test.loc[i, 'Item_Type']
    avg = test.loc[test['Item_Type'] == it, 'Item_Weight'].mean(skipna = True)
    test.loc[i, 'Item_Weight'] = avg

In [None]:
test.isnull().sum()

In [None]:
test[test['Item_Visibility'] == 0].shape[0]

In [None]:
# Mark zero visibility as missing.
# The result is assigned back to the column: replace(inplace=True) on a column
# selection is a chained update that pandas Copy-on-Write does not write back
# to `test`. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
test['Item_Visibility'] = test['Item_Visibility'].replace(0.0, np.nan)

In [None]:
test.isnull().sum()

### Missing values handling for Item Visibility: 

In [None]:
# Fill each missing Item_Visibility with the mean visibility of the test rows
# sharing its Item_Identifier. The filled values and their row labels are
# collected in parallel lists, identifier by identifier.
# One grouped pass replaces two full-frame scans per identifier; sort=False
# keeps the identifiers in order of first appearance, as unique() did.
iwvalues = []; iwindex = []

for j, visibility in test.groupby('Item_Identifier', sort=False)['Item_Visibility']:

    # NaN when every visibility of this identifier is missing; those rows stay NaN.
    avg = visibility.mean(skipna = True)

    result = visibility.fillna(avg)

    iwvalues.extend(result.values)
    iwindex.extend(result.index)

In [None]:
result = pd.Series(iwvalues, index = iwindex)
result.sort_index(inplace = True)
result.head()

In [None]:
for i in enumerate(test.columns):
    print(i)

In [None]:
test.drop('Item_Visibility', axis=1, inplace=True)

In [None]:
test.insert(3, 'Item_Visibility', result)

In [None]:
test.isnull().sum()

In [None]:
j = test['Item_Visibility'].isnull()
rows = []
for i,k in zip(j.index, j.values):
    if k == True:
        print("Row Label:",i)
        print(test.loc[i])
        rows.append(i)
        print("")

In [None]:
print(rows)

### Missing values handling for Outlet Size: 

In [None]:
for i in test['Outlet_Identifier'].unique():
    print("Outlet_Identifier:",i,test[test['Outlet_Identifier'] == i]['Outlet_Size'].unique())

In [None]:
r1 = test[test['Outlet_Identifier'] == 'OUT010']['Outlet_Size'].fillna("Small")

In [None]:
r2 = test[test['Outlet_Identifier'] == 'OUT017']['Outlet_Size'].fillna("Small")

In [None]:
r3 = test[test['Outlet_Identifier'] == 'OUT045']['Outlet_Size'].fillna("Small")

In [None]:
result = pd.concat([r1, r2, r3], axis=0)

In [None]:
result.sort_index(inplace=True)

In [None]:
result.head()

In [None]:
osindex = []; osvalues = []
for i, j in test['Outlet_Size'].isnull().items():
    if j == False:
        osindex.append(i)
        osvalues.append(test['Outlet_Size'][i])

In [None]:
remaining_items = pd.Series(osvalues, index=osindex)
remaining_items.head()

In [None]:
result = pd.concat([result, remaining_items], axis=0)

In [None]:
result.sort_index(inplace=True)

In [None]:
result.head()

In [None]:
test.drop('Outlet_Size', axis=1, inplace=True)

In [None]:
test.insert(8, 'Outlet_Size', result)

In [None]:
test.isnull().sum()

### Missing values handling for Remaining Item Visibility:

In [None]:
# Fallback for rows whose identifier had no known visibility: use the mean
# Item_Visibility of the row's Item_Type.
# Only the Item_Visibility cell is written. Filling the whole row
# (test.loc[i].fillna(avg)) would also overwrite any other missing field of
# that row with the visibility average.
for i in rows:
    it = test.loc[i, 'Item_Type']
    avg = test.loc[test['Item_Type'] == it, 'Item_Visibility'].mean(skipna = True)
    test.loc[i, 'Item_Visibility'] = avg

In [None]:
test.isnull().sum()

### Create a new column Age of Outlet

In [None]:
import datetime

In [None]:
today = datetime.date.today()

In [None]:
today

In [None]:
today.year

In [None]:
# Outlet age in years, relative to the year in which the notebook is executed.
test['Age_of_Outlet'] = today.year - test['Outlet_Establishment_Year']

In [None]:
test.head()

### Drop unnecessary columns

In [None]:
test.drop(['Item_Identifier', 'Outlet_Identifier', 'Outlet_Establishment_Year'], axis=1, inplace=True)

In [None]:
test.head()

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
label = LabelEncoder()

In [None]:
cat_features = test.describe(include=['object']).columns.tolist()

In [None]:
cat_features

In [None]:
# Replace every categorical column of the test frame with integer codes.
# NOTE(review): the encoder is refitted on the test data for each column, so
# the codes equal the training codes only if both files contain exactly the
# same set of labels in that column — confirm.
for i in cat_features:
    test[i] = label.fit_transform(test[i])

In [None]:
test.head()

### One Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
for i in enumerate(test.columns):
    print(i)

In [None]:
ohc = OneHotEncoder(categorical_features=[3,5,6,7])

In [None]:
X = ohc.fit_transform(test)

In [None]:
X = X.toarray()

In [None]:
X

### Make Predictions using XGBoost Model

In [None]:
y_predict = model_xg.predict(X)

### Collect item identifiers for all items

In [None]:
df = pd.read_csv("../Big_Mart_Sales/Test.csv")
df.head()

In [None]:
itemids = df['Item_Identifier'].tolist()
outletids = df['Outlet_Identifier'].tolist()

### Final Report

In [None]:
report = pd.DataFrame({"Item_Identifier": itemids, 
                       "Outlet_Identifier": outletids,
                       "Item_Outlet_Sales": y_predict})

In [None]:
report.head(10)

### Create submission file in CSV format

In [None]:
# Write only the three report columns; index=False keeps the row index from
# being emitted as an extra unnamed first column in the submission file.
report.to_csv("Submission.csv", index=False)