In [12]:
import pandas as pd
import numpy as np

## Investigation and definition

In [15]:
nyc_sales = pd.read_csv('nyc-rolling-sales_twentieth.csv')
pd.set_option('display.max_columns', 500)

In [16]:
nyc_sales.mode()

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,4,1.0,UPPER EAST SIDE (59-79),13 CONDOS - ELEVATOR APARTMENTS,2,16.0,1.0,,R4,50 WEST STREET,,10023.0,0.0,0.0,1.0,-,-,0.0,2.0,R4,-,2016-09-22 00:00:00
1,5,,,,,,,,,,,,,,,,,,,,,
2,6,,,,,,,,,,,,,,,,,,,,,
3,7,,,,,,,,,,,,,,,,,,,,,
4,8,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16904,16908,,,,,,,,,,,,,,,,,,,,,
16905,16909,,,,,,,,,,,,,,,,,,,,,
16906,16910,,,,,,,,,,,,,,,,,,,,,
16907,16911,,,,,,,,,,,,,,,,,,,,,


In [17]:
nyc_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16909 entries, 0 to 16908
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Unnamed: 0                      16909 non-null  int64 
 1   BOROUGH                         16909 non-null  int64 
 2   NEIGHBORHOOD                    16909 non-null  object
 3   BUILDING CLASS CATEGORY         16909 non-null  object
 4   TAX CLASS AT PRESENT            16909 non-null  object
 5   BLOCK                           16909 non-null  int64 
 6   LOT                             16909 non-null  int64 
 7   EASE-MENT                       16909 non-null  object
 8   BUILDING CLASS AT PRESENT       16909 non-null  object
 9   ADDRESS                         16909 non-null  object
 10  APARTMENT NUMBER                16909 non-null  object
 11  ZIP CODE                        16909 non-null  int64 
 12  RESIDENTIAL UNITS               16909 non-null

First impressions:
* - We have a column without any value, 
* - Important numeric features in Object mode, lets convert then in a numeric 
* - The feature BOROUGH is in a number type, lets change it to the neighborhood name 
* - Categoric features in INT type, lets convert then in object type

In [18]:
nyc_sales['SALE PRICE'] = pd.to_numeric(nyc_sales['SALE PRICE'], errors = 'coerce')
nyc_sales['SALE PRICE']
# erros ='coerce' -> when it has an error it will transform into NaN, in this case it does it in the - values

0        6625000.0
1              NaN
2              NaN
3        3936272.0
4        8000000.0
           ...    
16904          NaN
16905     712500.0
16906     740000.0
16907    1800000.0
16908          NaN
Name: SALE PRICE, Length: 16909, dtype: float64

In [19]:
# let's tranform all the others features that must be numeric type
for n in ['GROSS SQUARE FEET', 'LAND SQUARE FEET', 'APARTMENT NUMBER']:
    nyc_sales[n] = pd.to_numeric(nyc_sales[n] , errors = 'coerce')

In [20]:
# let's transform all the categoric features in object type
for n in ['BLOCK' , 'LOT' , 'TAX CLASS AT TIME OF SALE']:
    nyc_sales[n] = nyc_sales[n].astype('object')

In [21]:
# For the BOROUGH variables, let's follow the recommendations of the "documentation"
nyc_sales['BOROUGH'] = nyc_sales['BOROUGH'].map({
                                  1: 'Manhatan',
                                  2: 'Bronx' , 
                                  3: 'Brooklyn' , 
                                  4: 'Queens' ,
                                  5: 'State Island'})

In [22]:
nyc_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16909 entries, 0 to 16908
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0                      16909 non-null  int64  
 1   BOROUGH                         16909 non-null  object 
 2   NEIGHBORHOOD                    16909 non-null  object 
 3   BUILDING CLASS CATEGORY         16909 non-null  object 
 4   TAX CLASS AT PRESENT            16909 non-null  object 
 5   BLOCK                           16909 non-null  object 
 6   LOT                             16909 non-null  object 
 7   EASE-MENT                       16909 non-null  object 
 8   BUILDING CLASS AT PRESENT       16909 non-null  object 
 9   ADDRESS                         16909 non-null  object 
 10  APARTMENT NUMBER                1156 non-null   float64
 11  ZIP CODE                        16909 non-null  int64  
 12  RESIDENTIAL UNITS               

In [23]:
# Finally, we have the SALE DATE column which is a date, but it is a string. Let's convert it to date
nyc_sales['SALE DATE'] = pd.to_datetime(nyc_sales['SALE DATE'], format= '%Y-%m-%d')
# format= '%Y-%m-%d' -> define the date as Year - month - day

## Investigation of inconsistencies

* Dates out of range
* Feet² or sales price zeroed
* Negative values of price or feet²

In [24]:
# checking if we have some date after today
nyc_sales['SALE DATE'].max()

Timestamp('2017-08-31 00:00:00')

In [25]:
(nyc_sales['SALE DATE']> '2021-06-08').sum()

0

In [26]:
# conclusion : we don´t have any date after the current date 

In [27]:
# check if we have area less or igual to 0 
for n in ['SALE PRICE' , 'LAND SQUARE FEET' , 'GROSS SQUARE FEET']:
    if (nyc_sales[n] <= 0 ).sum() > 0:
        print(f'{n} have some inconsistencies')
    else:
        print(f'{n}  OK')

SALE PRICE está OK
LAND SQUARE FEET está OK
GROSS SQUARE FEET está OK


### Missing values


Missing data can have several sources:
* Insertion errors (bank error)
* human errors
* Operation errors (sensors)
* Equipment Faults

Furthermore, they can be classified as follows

* Missing Completely At Random (MCAR): The data has nothing to do with any data in the dataset. That is, the probability that the missing data exists is not associated with the absence or presence of other information or the column value itself.
* Missing At Random (MAR): The probability of a missing value occurring is associated with other information in the dataset.
* Missing Not at Random (MNAR): Missing data has to do with the value itself.

### Treatment

When missing data appears through a join in a table, we are almost always able to fill in the missing data with a value that makes sense. Example: Account withdrawal. Not all customers make withdrawals in a month. This way, we can fill in the missing amount in the cashout column for a customer who didn't cashout as 0.


Almost never the option to delete everything is the best, as some problems can arise, such as:

Introduction of Bias
loss of information

Some methods of handling missing data:

* deletion
* Filling with statistics (mode for categorical, median or mean for numeric)
* Population through a template (KNN, Decision Tree)
* Filling in with business rule
* Filling with the value above or bellow
* Filling with the most commun value in the column

Some good practices:

* Create Categorical Null for column (works for numeric and categorical)
* Don't drop as first option.

In [28]:
nyc_sales.shape[0]

16909

In [29]:
# find the Nulls
100*(nyc_sales.isnull().sum()/nyc_sales.shape[0])

Unnamed: 0                         0.000000
BOROUGH                            0.000000
NEIGHBORHOOD                       0.000000
BUILDING CLASS CATEGORY            0.000000
TAX CLASS AT PRESENT               0.000000
BLOCK                              0.000000
LOT                                0.000000
EASE-MENT                          0.000000
BUILDING CLASS AT PRESENT          0.000000
ADDRESS                            0.000000
APARTMENT NUMBER                  93.163404
ZIP CODE                           0.000000
RESIDENTIAL UNITS                  0.000000
COMMERCIAL UNITS                   0.000000
TOTAL UNITS                        0.000000
LAND SQUARE FEET                  89.762848
GROSS SQUARE FEET                 90.413389
YEAR BUILT                         0.000000
TAX CLASS AT TIME OF SALE          0.000000
BUILDING CLASS AT TIME OF SALE     0.000000
SALE PRICE                        20.781832
SALE DATE                          0.000000
dtype: float64


Treatment Options:

* Delete the Land Square Feet, Gross Square Feet and Apartment Number columns and add a Categorical Null column.
* Keep the column and categorize (ranges of values) by adding the Categorical Null

In [30]:
nyc_sales['IS_NULL_APARTMENT_NUMBER'] = False

In [31]:
nyc_sales.loc[nyc_sales['APARTMENT NUMBER'].isnull(), 'IS_NULL_APARTMENT_NUMBER'] = True

In [32]:
nyc_sales['IS_NULL_APARTMENT_NUMBER'].sum()/nyc_sales.shape[0]

0.9316340410432314

Let's create a column with a range of values and a categorical null, using the ```pd.cut``` function

In [33]:
nyc_sales['LAND SQUARE FEET RANGE'] = pd.cut(nyc_sales['LAND SQUARE FEET'] , bins = 5)
nyc_sales['LAND SQUARE FEET RANGE'] = nyc_sales['LAND SQUARE FEET RANGE'].astype('object')
nyc_sales['LAND SQUARE FEET RANGE'] = nyc_sales['LAND SQUARE FEET RANGE'].fillna('UNKNOW RANGE')

In [34]:
nyc_sales['GROSS SQUARE FEET RANGE'] = pd.cut(nyc_sales['GROSS SQUARE FEET'], bins=[0, 100000, 500000, 7500000, np.inf])

In [35]:
nyc_sales['GROSS SQUARE FEET RANGE'] = nyc_sales['GROSS SQUARE FEET RANGE'].astype('object')
nyc_sales['GROSS SQUARE FEET RANGE'] = nyc_sales['GROSS SQUARE FEET RANGE'].fillna('UNKNOW RANGE')

In [36]:
nyc_sales['GROSS SQUARE FEET RANGE']

0        (0.0, 100000.0]
1        (0.0, 100000.0]
2        (0.0, 100000.0]
3        (0.0, 100000.0]
4        (0.0, 100000.0]
              ...       
16904       UNKNOW RANGE
16905       UNKNOW RANGE
16906       UNKNOW RANGE
16907       UNKNOW RANGE
16908       UNKNOW RANGE
Name: GROSS SQUARE FEET RANGE, Length: 16909, dtype: object

In [37]:
nyc_sales

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,IS_NULL_APARTMENT_NUMBER,LAND SQUARE FEET RANGE,GROSS SQUARE FEET RANGE
0,4,Manhatan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,,10009,5,0,5,1633.0,6440.0,1900,2,C2,6625000.0,2017-07-19,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
1,5,Manhatan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,,10009,28,3,31,4616.0,18690.0,1900,2,C7,,2016-12-14,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
2,6,Manhatan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,,10009,16,1,17,2212.0,7803.0,1900,2,C7,,2016-12-09,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
3,7,Manhatan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,,10009,10,0,10,2272.0,6794.0,1913,2,C4,3936272.0,2016-09-23,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
4,8,Manhatan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,,10009,6,0,6,2369.0,4615.0,1900,2,C2,8000000.0,2016-11-17,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16904,16908,Manhatan,UPPER WEST SIDE (79-96),10 COOPS - ELEVATOR APARTMENTS,2,1237,29,,D4,"201 WEST 89TH STREET, 9A",,10024,0,0,0,,,1925,2,D4,,2017-02-08,True,UNKNOW RANGE,UNKNOW RANGE
16905,16909,Manhatan,UPPER WEST SIDE (79-96),10 COOPS - ELEVATOR APARTMENTS,2,1237,29,,D4,"201 WEST 89TH STREET, 10C",,10024,0,0,0,,,1925,2,D4,712500.0,2017-02-13,True,UNKNOW RANGE,UNKNOW RANGE
16906,16910,Manhatan,UPPER WEST SIDE (79-96),10 COOPS - ELEVATOR APARTMENTS,2,1237,29,,D4,"201 WEST 89 ST, 10D",,10024,0,0,0,,,1925,2,D4,740000.0,2017-02-13,True,UNKNOW RANGE,UNKNOW RANGE
16907,16911,Manhatan,UPPER WEST SIDE (79-96),10 COOPS - ELEVATOR APARTMENTS,2,1237,29,,D4,"201 WEST 89TH STREET, 7G",,10024,0,0,0,,,1925,2,D4,1800000.0,2017-04-27,True,UNKNOW RANGE,UNKNOW RANGE


In [38]:
# Another exemples

In [39]:
# creating a random dataframe
df_random = pd.DataFrame(np.random.randint(0,100 , size = (5,4)), columns = ['A' , 'B' , 'C' , 'D'])
df_random

Unnamed: 0,A,B,C,D
0,37,29,8,18
1,46,59,59,87
2,25,28,32,15
3,71,71,57,30
4,48,4,66,92


In [40]:
df_random.iloc[0:2, 2] = np.nan
df_random.iloc[[1,2], 0] = np.nan
df_random.iloc[[2, 3], 3] = np.nan

In [41]:
df_random

Unnamed: 0,A,B,C,D
0,37.0,29,,18.0
1,,59,,87.0
2,,28,32.0,
3,71.0,71,57.0,
4,48.0,4,66.0,92.0


In [42]:
df_random['A'].mean()

52.0

In [43]:
# using the mean to fill the NaN
df_random['A'] = df_random['A'].fillna(df_random['A'].mean())

In [44]:
df_random

Unnamed: 0,A,B,C,D
0,37.0,29,,18.0
1,52.0,59,,87.0
2,52.0,28,32.0,
3,71.0,71,57.0,
4,48.0,4,66.0,92.0


In [45]:
# Backward Fill
df_random['C'] = df_random['C'].fillna(method='bfill')

In [46]:
df_random

Unnamed: 0,A,B,C,D
0,37.0,29,32.0,18.0
1,52.0,59,32.0,87.0
2,52.0,28,32.0,
3,71.0,71,57.0,
4,48.0,4,66.0,92.0


In [47]:
# Forward fill
df_random['D'] = df_random['D'].fillna(method='ffill')

In [48]:
df_random

Unnamed: 0,A,B,C,D
0,37.0,29,32.0,18.0
1,52.0,59,32.0,87.0
2,52.0,28,32.0,87.0
3,71.0,71,57.0,87.0
4,48.0,4,66.0,92.0


# Analysis of Duplicates

Duplicates are records that are the same in their entirety or in a specific set of columns. In addition, we may also have equivalent and duplicate columns in our dataset that we don't want to keep as it creates redundancy. And in an eventual machine learning model training process this can lead to overfitting or overestimation of performance (in the case of duplicate rows)


To remove duplicates we use the ```drop_duplicates()``` function

In [49]:
# Without passing anything it observes if there is an exact match between any line and keeps only the first line it found
nyc_sales.drop_duplicates(keep='first')
# we can have keep=last (keep the last line) or keep = False (keep none)

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,IS_NULL_APARTMENT_NUMBER,LAND SQUARE FEET RANGE,GROSS SQUARE FEET RANGE
0,4,Manhatan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,,10009,5,0,5,1633.0,6440.0,1900,2,C2,6625000.0,2017-07-19,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
1,5,Manhatan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,,10009,28,3,31,4616.0,18690.0,1900,2,C7,,2016-12-14,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
2,6,Manhatan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,,10009,16,1,17,2212.0,7803.0,1900,2,C7,,2016-12-09,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
3,7,Manhatan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,,10009,10,0,10,2272.0,6794.0,1913,2,C4,3936272.0,2016-09-23,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
4,8,Manhatan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,,10009,6,0,6,2369.0,4615.0,1900,2,C2,8000000.0,2016-11-17,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16904,16908,Manhatan,UPPER WEST SIDE (79-96),10 COOPS - ELEVATOR APARTMENTS,2,1237,29,,D4,"201 WEST 89TH STREET, 9A",,10024,0,0,0,,,1925,2,D4,,2017-02-08,True,UNKNOW RANGE,UNKNOW RANGE
16905,16909,Manhatan,UPPER WEST SIDE (79-96),10 COOPS - ELEVATOR APARTMENTS,2,1237,29,,D4,"201 WEST 89TH STREET, 10C",,10024,0,0,0,,,1925,2,D4,712500.0,2017-02-13,True,UNKNOW RANGE,UNKNOW RANGE
16906,16910,Manhatan,UPPER WEST SIDE (79-96),10 COOPS - ELEVATOR APARTMENTS,2,1237,29,,D4,"201 WEST 89 ST, 10D",,10024,0,0,0,,,1925,2,D4,740000.0,2017-02-13,True,UNKNOW RANGE,UNKNOW RANGE
16907,16911,Manhatan,UPPER WEST SIDE (79-96),10 COOPS - ELEVATOR APARTMENTS,2,1237,29,,D4,"201 WEST 89TH STREET, 7G",,10024,0,0,0,,,1925,2,D4,1800000.0,2017-04-27,True,UNKNOW RANGE,UNKNOW RANGE


In [50]:
# The subset parameter gives us the freedom to look for duplicates only in a set of columns
nyc_sales.drop_duplicates(subset= ['BOROUGH','NEIGHBORHOOD'])
# we can have keep=last (keep the last line) or keep = False (keep none)

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,IS_NULL_APARTMENT_NUMBER,LAND SQUARE FEET RANGE,GROSS SQUARE FEET RANGE
0,4,Manhatan,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,,10009,5,0,5,1633.0,6440.0,1900,2,C2,6625000.0,2017-07-19,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
204,208,Manhatan,CHELSEA,01 ONE FAMILY DWELLINGS,1,766,13,,A9,251 WEST 16TH STREET,,10011,1,0,1,1360.0,2736.0,1910,1,A9,,2017-01-06,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
1007,1011,Manhatan,CHINATOWN,07 RENTALS - WALKUP APARTMENTS,2,162,15,,C7,28 MOTT STREET,,10013,17,1,18,2307.0,8577.0,1920,2,C7,,2017-06-16,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
1122,1126,Manhatan,CIVIC CENTER,07 RENTALS - WALKUP APARTMENTS,2B,175,19,,C7,41 WHITE STREET,,10013,7,1,8,2633.0,12000.0,1915,2,C7,141500000.0,2017-07-13,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
1445,1449,Manhatan,CLINTON,07 RENTALS - WALKUP APARTMENTS,2,1053,4,,C7,602 10TH AVENUE,,10036,14,1,15,2217.0,9475.0,1910,2,C7,,2016-11-04,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
1791,1795,Manhatan,EAST VILLAGE,02 TWO FAMILY DWELLINGS,1,436,1,,S2,138 1 AVENUE,,10009,2,1,3,1200.0,3360.0,1900,1,S2,3000000.0,2017-08-31,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
1969,1973,Manhatan,FASHION,07 RENTALS - WALKUP APARTMENTS,2,762,17,,C4,331 WEST 38TH STREET,,10018,16,2,18,2475.0,9690.0,1920,2,C4,,2016-12-23,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
2090,2094,Manhatan,FINANCIAL,08 RENTALS - ELEVATOR APARTMENTS,2,27,9,,D5,63 WALL STREET,,10005,476,6,482,17623.0,400531.0,1929,2,D5,239114603.0,2016-09-30,True,"(-275.912, 72870.4]","(100000.0, 500000.0]"
2586,2590,Manhatan,FLATIRON,08 RENTALS - ELEVATOR APARTMENTS,2,844,8,,D6,9 EAST 16TH STREET,,10003,15,1,16,4600.0,30680.0,1900,2,D6,,2017-01-30,True,"(-275.912, 72870.4]","(0.0, 100000.0]"
3059,3063,Manhatan,GRAMERCY,02 TWO FAMILY DWELLINGS,1,873,31,,S2,141 EAST 17TH STREET,,10003,2,1,3,2300.0,6720.0,1920,1,S2,5300000.0,2017-08-03,True,"(-275.912, 72870.4]","(0.0, 100000.0]"


In [51]:
nyc_sales['DUPLICATED_BLOCK'] =  nyc_sales['BLOCK']

## Removing Constant Columns


A constant column does not aggregate information for any analysis. We usually exclude.

In [52]:
# we can scan the columns with a foor lop and identify which columns have only one possible padding value
colunas_constantes = []
for col in nyc_sales.columns:
    if len(nyc_sales[col].unique()) == 1:
        colunas_constantes.append(col)
    else:
        print(f'{col} OK')

Unnamed: 0 OK
NEIGHBORHOOD OK
BUILDING CLASS CATEGORY OK
TAX CLASS AT PRESENT OK
BLOCK OK
LOT OK
BUILDING CLASS AT PRESENT OK
ADDRESS OK
APARTMENT NUMBER OK
ZIP CODE OK
RESIDENTIAL UNITS OK
COMMERCIAL UNITS OK
TOTAL UNITS OK
LAND SQUARE FEET OK
GROSS SQUARE FEET OK
YEAR BUILT OK
TAX CLASS AT TIME OF SALE OK
BUILDING CLASS AT TIME OF SALE OK
SALE PRICE OK
SALE DATE OK
IS_NULL_APARTMENT_NUMBER OK
LAND SQUARE FEET RANGE OK
GROSS SQUARE FEET RANGE OK
DUPLICATED_BLOCK OK


constant_columns

## Removal By Low Variance

Variance is a measure of how spread out your data is around the mean.

The data from set 2 are more dispersed and, therefore, we say that it has greater variance in relation to the first set.

When we collect data to use in a machine learning model, it is interesting that they have a minimum level of variance. In this way, we will filter columns whose variance is very close to zero (remember that we have already eliminated those that had all the elements the same).


Set 1 below has a single different value (6) that varies by 1 unit in relation to the others. Set 2, in turn, also has only one value different from the others (600) and 100 units more than the others. Proportionally, the two sets vary on the same scale. But by calculating the variance, they are different.

Let's demonstrate how to remove columns (variables with low variance) from the dataset.


First of all we need to scale our data before calculating the variance. Let's do this using Scikit-Learn's MinMaxScaler.

# <img src="https://media.geeksforgeeks.org/wp-content/uploads/min-max-normalisation.jpg" heigh=350 width=350>

In [60]:
data_1 = np.array([5,5,6,5,5])
data_1.mean()

5.2

In [61]:
data_2 = np.array([500, 500, 600, 500, 500])
data_2.mean()

520.0

In [62]:
np.std(data_1)

0.39999999999999997

In [63]:
np.std(data_2)

40.0

In [64]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
data_1_escalado = minmax.fit_transform(data_1.reshape(-1,1))
data_2_escalado = minmax.fit_transform(data_2.reshape(-1,1))

# the reshape is because the function needs the array to be 2 dimensional -> it needs columns

In [65]:
np.var(data_1_escalado)

0.16000000000000006

In [66]:
data_1_escalado

array([[0.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [67]:
np.var(data_2_escalado)

0.16000000000000006

In [68]:
variaveis_numericas_nyc = nyc_sales.select_dtypes(include=['int', 'float'])
variaveis_numericas_escaladas_nyc = minmax.fit_transform(variaveis_numericas_nyc)

In [69]:
variaveis_numericas_escaladas_nyc = pd.DataFrame(variaveis_numericas_escaladas_nyc, columns=variaveis_numericas_nyc.columns)

In [71]:
listLowVariance = []
for col in variaveis_numericas_escaladas_nyc.columns:
    if np.var(variaveis_numericas_escaladas_nyc[col]) > 0.001:
        print(f'{col} has considerable variance')
    else:
        print(f'{col} has low variance')
        listLowVariance.append(col)

Unnamed: 0 has considerable variance
APARTMENT NUMBER has considerable variance
ZIP CODE has considerable variance
RESIDENTIAL UNITS has low variance
COMMERCIAL UNITS has low variance
TOTAL UNITS has low variance
LAND SQUARE FEET has considerable variance
GROSS SQUARE FEET has considerable variance
YEAR BUILT has considerable variance
SALE PRICE has low variance


In [72]:
listLowVariance

['RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS', 'SALE PRICE']