In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# importing all required libraries
import warnings
# Silence every warning category for the rest of the session.
# Note that this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.graphics.gofplots import qqplot

import random as python_random 
from sklearn.decomposition import PCA,FastICA
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,RepeatedKFold,ShuffleSplit,StratifiedShuffleSplit
from scipy.stats import mode
from sklearn.feature_selection import SelectFromModel,RFECV,RFE
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler,PolynomialFeatures
from sklearn.metrics import f1_score,roc_auc_score,mean_squared_error,accuracy_score,log_loss,classification_report
import random as python_random 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression,ElasticNet,RidgeClassifier,Lasso
from sklearn.ensemble import (IsolationForest,RandomForestClassifier,BaggingClassifier,ExtraTreesClassifier,GradientBoostingClassifier,AdaBoostClassifier,VotingClassifier,StackingClassifier)
from catboost import CatBoostClassifier,Pool
from xgboost import XGBRFClassifier,XGBClassifier
from lightgbm import LGBMClassifier

from imblearn.over_sampling import RandomOverSampler,ADASYN,SMOTE,SVMSMOTE,KMeansSMOTE,SMOTENC
from imblearn.ensemble import BalancedBaggingClassifier,BalancedRandomForestClassifier,EasyEnsembleClassifier,RUSBoostClassifier
from imblearn.under_sampling import RandomUnderSampler,CondensedNearestNeighbour
from category_encoders import (BinaryEncoder,CatBoostEncoder,CountEncoder,HashingEncoder,TargetEncoder,
JamesSteinEncoder,LeaveOneOutEncoder,OneHotEncoder,PolynomialEncoder,WOEEncoder)

In [3]:
# Load the Cambodia price-data-collection CSV into a DataFrame.
csv_path = '/kaggle/input/cambodia-price-data-collection/khm-crowdsourcedpdcpilot02_final_obs_all_clean.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows of the table.
df.head(n=5)

In [5]:
# Dataset dimensions as a (rows, columns) tuple.
row_count, column_count = df.shape
(row_count, column_count)

In [6]:
# Parse the 'created_at' column into pandas datetimes.
df['created_at'] = pd.to_datetime(df['created_at'])

In [7]:
# Checking the basic information about the dataset.
# DataFrame.info() prints the index range, every column's non-null count and dtype, and the memory usage.

df.info()

In [8]:
# Count the distinct values in every column; missing values are not counted.
df.nunique(axis=0, dropna=True)

In [9]:
# Drop columns that carry no extra information:
#   * 'item_code' and 'bh_code' (listed by the author as duplicated columns), and
#   * every column that holds exactly one distinct non-null value.
single_val_col = [col for col in df.columns if df[col].nunique() == 1]
to_remove = ['item_code', 'bh_code'] + single_val_col

# errors='ignore' keeps this cell re-runnable: on a second execution the
# hard-coded columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=to_remove, errors='ignore')

In [10]:
# Per-column count of missing entries.
missing_per_column = df.isna().sum()
missing_per_column

In [11]:
# Heatmap of the boolean missing-value mask (True where a cell is missing).
missing_mask = df.isna()
sns.heatmap(missing_mask)

In [12]:
# Preview the first five rows again, now that columns have been dropped.
df.head(n=5)

In [13]:
# Summary statistics of the dataset, using the default quartiles spelled out explicitly.
df.describe(percentiles=[0.25, 0.5, 0.75])

In [14]:
# Correlation matrix
#
# Only numeric/boolean columns can be correlated, so select them explicitly.
# pandas >= 2.0 no longer drops text columns silently; calling df.corr() on a
# mixed-dtype frame raises instead.
numeric_df = df.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(16,10))
sns.heatmap(numeric_df.corr(), annot=True)
plt.show()

### Univariate Analysis

   #### 1. Numeric Columns

In [15]:

# Checking and visualizing the type of distribution of the two price columns:
# histogram with a KDE overlay on the left, quantile-quantile plot on the right.
for col in ['price', 'normalized_price']:
    # Drop missing values once so both panels see the same data; NaN would
    # otherwise propagate into the Q-Q plot's fitted reference line.
    values = df[col].dropna()
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 4))
    # histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
    sns.histplot(values, kde=True, stat='density', ax=ax1, color='red')
    ax1.set(title=f'{col} distribution')
    qqplot(values, ax=ax2, line='s')
    ax2.set(title='Quantile quantile plot')
    # Render the figure here so the cell does not echo a [Text(...)] repr.
    plt.show()

In [17]:
# Names of all columns whose dtype is not object. Besides the numeric columns
# this also keeps the datetime 'created_at' column.
num_col = df.select_dtypes(exclude=['object']).columns

In [18]:
# Display the Index of non-object column names.
num_col

In [19]:
# Checking and visualizing the type of distribution and variation of numeric columns with time
# NOTE(review): num_col[1:] skips the first non-object column - presumably an
# identifier or the timestamp itself; confirm against the num_col output.
for col in num_col[1:]:
    fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(25, 5))
    # histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
    sns.histplot(df[col], kde=True, stat='density', ax=ax1, color='red')
    ax1.set(title=f'{col} distribution')
    # x and y must be passed by keyword: seaborn >= 0.12 rejects positional data vectors.
    sns.scatterplot(x=df['created_at'], y=df[col], ax=ax2)
    ax2.set(title=f"Graph of {col} against date")
    # Drop missing values so they do not propagate into the Q-Q reference line.
    qqplot(df[col].dropna(), ax=ax3, line='s')
    ax3.set(title='Quantile quantile plot')
    # Render each figure as soon as it is complete instead of leaving all of
    # them open until the cell finishes.
    plt.show()

In [20]:
# Checking skewness value
# Rule of thumb: a skewness between -0.5 and 0.5 means the column is roughly
# symmetric; outside that range it is noticeably skewed. (Low skew alone does
# not prove the column is normally distributed.)

# numeric_only=True restricts the computation to numeric columns; pandas >= 2.0
# raises on text/datetime columns instead of skipping them silently.
skew_val = df.skew(numeric_only=True).sort_values(ascending=False)
skew_val

In [21]:
# Bar chart of the skewness of each column.
fig, ax = plt.subplots(figsize=(16, 10))
skew_val.plot.bar(ax=ax)

   #### 2. Categorical Columns

In [42]:
# One donut chart per column, showing the share of records for each distinct value.
# The seven original cells were identical apart from the column name, so the
# column list drives a single loop; each chart is titled with its column
# because the name is no longer visible in the cell source.
pie_columns = ["quantity", "currency", "units", "store_type", "l1", "l2", "city_radius"]

for col in pie_columns:
    fig = px.pie(df, names=col, hole=0.4, template="gridon", title=f"Share of records by {col}")
    fig.show()

### Bivariate Analysis

In [36]:
# Checking the effect of pairs of categorical features on price.
# Each entry is (x-axis column, colour column); the eight original cells
# differed only in these two names.
# NOTE(review): px.bar draws one bar segment per row and stacks them by default,
# so bar height reflects summed price (and therefore row count), not an average.
bar_specs = [
    ('quantity', 'units'),
    ('city_radius', 'units'),
    ('currency', 'units'),
    ('l1_geo', 'city_radius'),
    ('l1_geo', 'units'),
    ('store_type', 'units'),
    ('reference_unit_of_measurement', 'units'),
    ('l2_geo', 'city_radius'),
]

for x_col, color_col in bar_specs:
    fig = px.bar(
        df,
        x=x_col,
        y='price',
        color=color_col,
        title=f"price by {x_col}, coloured by {color_col}",
    )
    fig.show()