# Problem Set IV - Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.svm import LinearSVR
from sklearn.preprocessing import binarize, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import RFE, SelectKBest, f_classif, SelectPercentile
from sklearn.impute import SimpleImputer

In [2]:
# Load data into pandas.DataFrame from Avocado Dataset.csv
avdata = pd.read_csv("./Avocado.csv", index_col= 0)
avdata

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,Hierarchy
0,27-12-2015,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany,Old
1,20-12-2015,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany,Old
2,13-12-2015,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany,Old
3,06-12-2015,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany,Old
4,29-11-2015,1.29,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany,Old
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18245,28-01-2018,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico,Recent
18246,21-01-2018,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico,Recent
18247,14-01-2018,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico,Recent
18248,07-01-2018,1.62,17489.58,2894.77,2356.13,224.53,12014.15,11988.14,26.01,0.0,organic,2018,WestTexNewMexico,Recent


In [3]:
# Load data into pandas.DataFrame from Avocado Dataset.csv
trdata = pd.read_csv("./Trail.csv")
trdata

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,20-12-2015,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
1,13-12-2015,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
2,06-12-2015,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany
3,22-11-2015,,55979.78,1184.27,48067.99,43.61,6683.91,6556.47,127.44,0.0,conventional,2015,Albany
4,20-12-2015,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,20-09-2015,,498640.23,4376.74,398673.48,418.46,95171.55,91612.66,3558.89,0.0,conventional,2015,Boston
198,13-09-2015,,655682.95,5422.29,560792.23,353.77,89114.66,84843.55,4271.11,0.0,conventional,2015,Boston
199,06-09-2015,,577774.74,4237.44,477867.83,496.62,95172.85,94558.41,614.44,0.0,conventional,2015,Boston
200,30-08-2015,,526664.87,4177.03,438502.90,554.04,83430.90,83242.01,188.89,0.0,conventional,2015,Boston


In [4]:
avdata.dtypes

Date             object
AveragePrice    float64
Total Volume    float64
4046            float64
4225            float64
4770            float64
Total Bags      float64
Small Bags      float64
Large Bags      float64
XLarge Bags     float64
type             object
year              int64
region           object
Hierarchy        object
dtype: object

## 1.
Select a subset of relevant attributes from the given dataset that are necessary to know about thetotal volume of avocados with product lookup codes (PLU) 4046, 4225, 4770) which are of organic type. (Use AVOCADO dataset)


In [5]:
avdata['type'] = [x.strip() for x in avdata['type'].astype(str)]

In [6]:
orgData = pd.DataFrame([avdata.iloc[i] for i in avdata.index if avdata['type'].iloc[i] == 'organic'])

In [7]:
orgData[['Total Volume', "4046", "4225", "4770"]]

Unnamed: 0,Total Volume,4046,4225,4770
9126,989.55,8.16,88.59,0.00
9127,1163.03,30.24,172.14,0.00
9128,995.96,10.44,178.70,0.00
9129,1158.42,90.29,104.18,0.00
9130,831.69,0.00,94.73,0.00
...,...,...,...,...
18245,13888.04,1191.70,3431.50,0.00
18246,13766.76,1191.92,2452.79,727.94
18247,16205.22,1527.63,2981.04,727.01
18248,17489.58,2894.77,2356.13,224.53


## 2. 
Discard all duplicate entries in the given dataset and fill all the missing values in the attribute “AveragePrice” as 1.25. Also print the size of the dataset before and after removing duplicates. (Use Trail dataset)


In [8]:
trdata.shape

(202, 13)

In [9]:
trdata = trdata.drop_duplicates()

In [10]:
trdata.shape

(195, 13)

In [11]:
is_number = lambda x: re.match(r'[+-]?[0-9]+\.?[0-9]*', str(x)) is not None

In [44]:
defaultAvgPrice = 1.25
for i in range(len(trdata)):
    if is_number(trdata.iloc[i]['AveragePrice']) == False:
        trdata.loc[i, 'AveragePrice'] = defaultAvgPrice
trdata

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,20-12-2015,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
1,13-12-2015,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
2,06-12-2015,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany
3,22-11-2015,1.25,55979.78,1184.27,48067.99,43.61,6683.91,6556.47,127.44,0.0,conventional,2015,Albany
5,13-12-2015,1.21,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,organic,2015,xxxx
...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,20-09-2015,,498640.23,4376.74,398673.48,418.46,95171.55,91612.66,3558.89,0.0,conventional,2015,Boston
198,13-09-2015,,655682.95,5422.29,560792.23,353.77,89114.66,84843.55,4271.11,0.0,conventional,2015,Boston
199,06-09-2015,,577774.74,4237.44,477867.83,496.62,95172.85,94558.41,614.44,0.0,conventional,2015,Boston
200,30-08-2015,,526664.87,4177.03,438502.90,554.04,83430.90,83242.01,188.89,0.0,conventional,2015,Boston


## 3. 
Binarize the attribute “Year”. Set the threshold above 2016 and print it without truncation. (Use AVOCADO dataset)

In [13]:
binarize(avdata['year'].values.reshape(-1, 1), 2016)

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]])

## 4. 
Transform all categorical attributes in the dataset AVOCADO using Integer Encoding.

In [14]:
OrdinalEncoder(avdata)

OrdinalEncoder(categories=             Date  AveragePrice  Total Volume     4046       4225    4770  \
0      27-12-2015          1.33      64236.62  1036.74   54454.85   48.16   
1      20-12-2015          1.35      54876.98   674.28   44638.81   58.33   
2      13-12-2015          0.93     118220.22   794.70  109149.67  130.50   
3      06-12-2015          1.08      78992.15  1132.00   71976.41   72.58   
4      29-11-2015          1.29      51039.60   941.48   43838.39   75.78   
...           ...           ...           ...      ...        ...     ...   
18245  28-01-2018          1.71      13888.04  1191.70    3431.50    0.00   
182...
18247    10969.54    10919.54       50.00          0.0       organic  2018   
18248    12014.15    11988.14       26.01          0.0       organic  2018   
18249    12341.48    12114.81      226.67          0.0       organic  2018   

                 region Hierarchy  
0                Albany       Old  
1                Albany       Old  
2       

## 5. 
Transform the attribute = “Region” in the given dataset AVOCADO using One-Hot Encoding.

In [15]:
OneHotEncoder(avdata['region'])

OneHotEncoder(categories=0                  Albany
1                  Albany
2                  Albany
3                  Albany
4                  Albany
               ...       
18245    WestTexNewMexico
18246    WestTexNewMexico
18247    WestTexNewMexico
18248    WestTexNewMexico
18249    WestTexNewMexico
Name: region, Length: 18250, dtype: object,
              drop=None, dtype=<class 'numpy.float64'>, handle_unknown='error',
              sparse=True)

## 6. 
Ignore the tuples that hold missing values and print the subset of data from AVOCADO dataset.

In [16]:
avdata.dropna()

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,Hierarchy
0,27-12-2015,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany,Old
1,20-12-2015,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany,Old
2,13-12-2015,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany,Old
3,06-12-2015,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany,Old
4,29-11-2015,1.29,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany,Old
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18245,28-01-2018,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico,Recent
18246,21-01-2018,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico,Recent
18247,14-01-2018,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico,Recent
18248,07-01-2018,1.62,17489.58,2894.77,2356.13,224.53,12014.15,11988.14,26.01,0.0,organic,2018,WestTexNewMexico,Recent


## 7. 
Drop the attribute that has high nullity as it facilitates efficient prediction. (Use AVOCADO dataset)

#### Nullity
The nullity taken here is the count of *np.nan* in the column, my doing *np.mean()* on this is more like counting the **True** in the returned array and the dividing by the number of rows, hence the complete operation calculates the percentage of *np.nan* in the column.

In [17]:
nullity_threshhold = .1
avdata.loc[:, avdata.isnull().mean() < nullity_threshhold] 
# the data already has been preprocessed and hence should result in all rows as output

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region,Hierarchy
0,27-12-2015,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany,Old
1,20-12-2015,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany,Old
2,13-12-2015,0.93,118220.22,794.70,109149.67,130.50,8145.35,8042.21,103.14,0.0,conventional,2015,Albany,Old
3,06-12-2015,1.08,78992.15,1132.00,71976.41,72.58,5811.16,5677.40,133.76,0.0,conventional,2015,Albany,Old
4,29-11-2015,1.29,51039.60,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany,Old
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18245,28-01-2018,1.71,13888.04,1191.70,3431.50,0.00,9264.84,8940.04,324.80,0.0,organic,2018,WestTexNewMexico,Recent
18246,21-01-2018,1.87,13766.76,1191.92,2452.79,727.94,9394.11,9351.80,42.31,0.0,organic,2018,WestTexNewMexico,Recent
18247,14-01-2018,1.93,16205.22,1527.63,2981.04,727.01,10969.54,10919.54,50.00,0.0,organic,2018,WestTexNewMexico,Recent
18248,07-01-2018,1.62,17489.58,2894.77,2356.13,224.53,12014.15,11988.14,26.01,0.0,organic,2018,WestTexNewMexico,Recent


## 8. 
Study the entire dataset and report the complete statistical summary about the data (Use AVOCADO dataset)
    
(For the below exercises, you are free to choose an appropriate data set as merited by the problem statements)

- Dimension of the dataset


In [18]:
avdata.shape

(18250, 14)

- Most frequently occurring value under every attribute.


In [19]:
print(*[(x, avdata[x].value_counts().idxmax()) for x in avdata], sep= '\n')

('Date', '18-03-2018')
('AveragePrice', 1.15)
('Total Volume', 19634.24)
('4046', 0.0)
('4225', 0.0)
('4770', 0.0)
('Total Bags', 0.0)
('Small Bags', 0.0)
('Large Bags', 0.0)
('XLarge Bags', 0.0)
('type', 'conventional')
('year', 2017)
('region', 'Syracuse')
('Hierarchy', 'Old')


- Datatype of every attribute


In [20]:
avdata.dtypes

Date             object
AveragePrice    float64
Total Volume    float64
4046            float64
4225            float64
4770            float64
Total Bags      float64
Small Bags      float64
Large Bags      float64
XLarge Bags     float64
type             object
year              int64
region           object
Hierarchy        object
dtype: object

- Count


In [21]:
avdata.count()

Date            18250
AveragePrice    18250
Total Volume    18250
4046            18250
4225            18250
4770            18250
Total Bags      18250
Small Bags      18250
Large Bags      18250
XLarge Bags     18250
type            18250
year            18250
region          18250
Hierarchy       18250
dtype: int64

- Mean


In [22]:
avdata.mean()

AveragePrice         1.406852
Total Volume    850598.273413
4046            292992.481896
4225            295138.477670
4770             22838.484500
Total Bags      239626.747390
Small Bags      182185.367250
Large Bags       54335.123135
XLarge Bags       3106.256292
year              2016.148000
dtype: float64

- Standard Deviation


In [23]:
avdata.std()

AveragePrice    4.024822e-01
Total Volume    3.453456e+06
4046            1.264956e+06
4225            1.204089e+06
4770            1.074613e+05
Total Bags      9.862168e+05
Small Bags      7.461591e+05
Large Bags      2.439596e+05
XLarge Bags     1.769242e+04
year            9.400127e-01
dtype: float64

- Minimum Value


In [24]:
avdata.min()

Date              01-01-2017
AveragePrice            0.44
Total Volume           84.56
4046                       0
4225                       0
4770                       0
Total Bags                 0
Small Bags                 0
Large Bags                 0
XLarge Bags                0
type            conventional
year                    2015
region                Albany
Hierarchy                New
dtype: object

- Maximum value

In [25]:
avdata.max()

Date                  31-12-2017
AveragePrice                3.25
Total Volume         6.25056e+07
4046                 2.27436e+07
4225                 2.04706e+07
4770                 2.54644e+06
Total Bags           1.93731e+07
Small Bags           1.33846e+07
Large Bags            5.7191e+06
XLarge Bags               551694
type                     organic
year                        2018
region          WestTexNewMexico
Hierarchy                 Recent
dtype: object

- 25% (Lower Quartile)
- Median i.e. 50%
- 75% (Upper Quartile)


In [26]:
avdata.quantile([0.25,0.5,0.75])

Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,year
0.25,1.1,10839.6275,854.21,3008.0975,0.0,5089.0825,2850.3225,127.58,0.0,2015.0
0.5,1.37,107365.505,8643.2,29058.875,184.975,39741.18,26351.615,2647.27,0.0,2016.0
0.75,1.66,432952.665,111008.7125,150166.335,6242.055,110781.115,83336.21,22018.275,132.4325,2017.0


- Find whether the class distribution of dataset is imbalanced. (Note: Fix the class label as “Type” in the given dataset)



In [27]:
avdata.type.value_counts()
# count doesn't vary too much wrt type 
# hence we can say that the ditribution data is not imbalanced

conventional    9126
organic         9124
Name: type, dtype: int64

- Correlation matrix


In [28]:
avdata.corr()

Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,year
AveragePrice,1.0,-0.193116,-0.208891,-0.172971,-0.179811,-0.177532,-0.175129,-0.173481,-0.118026,0.090644
Total Volume,-0.193116,1.0,0.977863,0.974181,0.872203,0.963047,0.967238,0.88064,0.747158,0.017165
4046,-0.208891,0.977863,1.0,0.92611,0.83339,0.920057,0.92528,0.838645,0.699378,0.003328
4225,-0.172971,0.974181,0.92611,1.0,0.887855,0.905788,0.916031,0.810016,0.688809,-0.009584
4770,-0.179811,0.872203,0.83339,0.887855,1.0,0.792315,0.802733,0.698472,0.679862,-0.03655
Total Bags,-0.177532,0.963047,0.920057,0.905788,0.792315,1.0,0.994335,0.943009,0.804233,0.07152
Small Bags,-0.175129,0.967238,0.92528,0.916031,0.802733,0.994335,1.0,0.902589,0.806845,0.063883
Large Bags,-0.173481,0.88064,0.838645,0.810016,0.698472,0.943009,0.902589,1.0,0.710859,0.087858
XLarge Bags,-0.118026,0.747158,0.699378,0.688809,0.679862,0.804233,0.806845,0.710859,1.0,0.081005
year,0.090644,0.017165,0.003328,-0.009584,-0.03655,0.07152,0.063883,0.087858,0.081005,1.0


- Skewness of every attribute.


In [29]:
avdata.skew()

AveragePrice     0.575674
Total Volume     9.007930
4046             8.648456
4225             8.942706
4770            10.159671
Total Bags       9.756334
Small Bags       9.540917
Large Bags       9.796719
XLarge Bags     13.140106
year             0.215371
dtype: float64

### Data Summary
The above can also be done by a simple command used.

In [30]:
avdata.describe()

Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,year
count,18250.0,18250.0,18250.0,18250.0,18250.0,18250.0,18250.0,18250.0,18250.0,18250.0
mean,1.406852,850598.3,292992.5,295138.5,22838.48,239626.7,182185.4,54335.12,3106.256292,2016.148
std,0.402482,3453456.0,1264956.0,1204089.0,107461.3,986216.8,746159.1,243959.6,17692.424825,0.940013
min,0.44,84.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015.0
25%,1.1,10839.63,854.21,3008.098,0.0,5089.083,2850.323,127.58,0.0,2015.0
50%,1.37,107365.5,8643.2,29058.88,184.975,39741.18,26351.61,2647.27,0.0,2016.0
75%,1.66,432952.7,111008.7,150166.3,6242.055,110781.1,83336.21,22018.28,132.4325,2017.0
max,3.25,62505650.0,22743620.0,20470570.0,2546439.0,19373130.0,13384590.0,5719097.0,551693.65,2018.0


## 9. 
Test drive the use of Gini Index, Information Gain, Entropy and other measures that are supported in your platform, performing the role of data selection.


### Recursive Feature Elimination (RFE) 

"*the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through any specific attribute or callable. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.*" - [***sklearn.feature_selection.RFE***](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html)

Estimator- "*In machine learning, an estimator is an equation for picking the “best,” or most likely accurate, data model based upon observations in realty.*"- [***deepai.org***](https://deepai.org/machine-learning-glossary-and-terms/estimator#:~:text=In%20machine%20learning%2C%20an%20estimator,based%20upon%20observations%20in%20realty.&text=This%20estimate%20is%20then%20inserted,determine%20what%20action%20to%20take.)

In [31]:
avdata.dtypes, avdata.columns

(Date             object
 AveragePrice    float64
 Total Volume    float64
 4046            float64
 4225            float64
 4770            float64
 Total Bags      float64
 Small Bags      float64
 Large Bags      float64
 XLarge Bags     float64
 type             object
 year              int64
 region           object
 Hierarchy        object
 dtype: object,
 Index(['Date', 'AveragePrice', 'Total Volume', '4046', '4225', '4770',
        'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type', 'year',
        'region', 'Hierarchy'],
       dtype='object'))

In [32]:
estimator = LinearSVR(max_iter= 1000)
selector = RFE(estimator, n_features_to_select= 2, step= 1, verbose= True)
col_list = ['AveragePrice', 'Total Volume', '4046', '4225', '4770']
X = avdata[col_list]
y = avdata['Total Bags']
selector.fit(X, y)
selector.support_

Fitting estimator with 5 features.




Fitting estimator with 4 features.




Fitting estimator with 3 features.




array([False,  True, False,  True, False])

### Select K-Best

"*It takes as a parameter a score function, which must be applicable to a pair (X, y). The score function must return an array of scores, one for each feature X[:,i] of X (additionally, it can also return p-values, but these are neither needed nor required). SelectKBest then simply retains the first k features of X with the highest scores.*" 

In [33]:
selector = SelectKBest(f_classif, k=2)
selector.fit(X, y)
scores = -np.log10(selector.pvalues_)
scores 

  This is separate from the ipykernel package so we can avoid doing imports until


array([  7.01075223, 272.289062  ,          inf, 213.89923231,
       252.23716015])

In [34]:
selector = SelectPercentile(f_classif, percentile= 10)
selector.fit_transform(X, y).shape

(18250, 1)

## 10.
Test drive the implementation support in your platform of choice for data preprocessing phases such as cleaning, selection, transformation, integration in addition to the earlier exercises.

### Data Cleaning
#### Class Used:
*sklearn.impute.SimpleImputer*
#### Strategy Used:
replacing missing values (most of them being numeric type and hence *np.nan*) by mean of the data column.

In [35]:
imputer = SimpleImputer(missing_values= np.nan,strategy= "mean") 
imputer.fit(avdata['Total Volume'].values.reshape(-1, 1))
imputer.fit(avdata['AveragePrice'].values.reshape(-1, 1))

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

### Data Normalization
#### *sklearn.preprocessing.StandardScaler* - Standard Scaler Sklearn
"*The idea behind StandardScaler is that it will transform your data such that its distribution will have a mean value 0 and standard deviation of 1.
In case of multivariate data, this is done feature-wise (in other words independently for each column of the data).
Given the distribution of the data, each value in the dataset will have the mean value subtracted, and then divided by the standard deviation of the whole dataset (or feature in the multivariate case).*[***Standard Scaler Scikit-Learn***](https://stackoverflow.com/questions/40758562/can-anyone-explain-me-standardscaler)

---
"*StandardScaler() will normalize the features i.e. each column of X, INDIVIDUALLY, so that each column/feature/variable will have μ = 0 and σ = 1*" [***Standard Scaler Scikit-Learn***](https://stackoverflow.com/questions/40758562/can-anyone-explain-me-standardscaler)

In [36]:
avdata.dtypes, avdata.columns

(Date             object
 AveragePrice    float64
 Total Volume    float64
 4046            float64
 4225            float64
 4770            float64
 Total Bags      float64
 Small Bags      float64
 Large Bags      float64
 XLarge Bags     float64
 type             object
 year              int64
 region           object
 Hierarchy        object
 dtype: object,
 Index(['Date', 'AveragePrice', 'Total Volume', '4046', '4225', '4770',
        'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'type', 'year',
        'region', 'Hierarchy'],
       dtype='object'))

In [37]:
scaler = StandardScaler()
num_data  = ['AveragePrice', 'Total Volume', '4046', '4225', '4770',
       'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'year']
scaler.fit(avdata[num_data])

StandardScaler(copy=True, with_mean=True, with_std=True)

In [38]:
[print(num_data[i], ":\t\t", scaler.mean_[i]) for i in range(len(num_data))]

AveragePrice :		 1.4068523761338763
Total Volume :		 850598.2734126027
4046 :		 292992.48189643834
4225 :		 295138.477670137
4770 :		 22838.484500273968
Total Bags :		 239626.74739013697
Small Bags :		 182185.36725041096
Large Bags :		 54335.123135342474
XLarge Bags :		 3106.256292054794
year :		 2016.148


[None, None, None, None, None, None, None, None, None, None]

### Note
The Scikit-Learn doesn't actually contain a for reading and writing data, hence the data transformation is not actually done using *sklearn*. Hence *pandas* is used for this purpose.

### Other Operations
The operations like mean, dimensions, etc. are done in *pandas* already.