# Importing products.csv to DataFrame


In [1]:
import pandas as pd
import numpy as np

In [2]:
url = "https://drive.google.com/file/d/1afxwDXfl-7cQ_qLwyDitfcCx3u7WMvkU/view?usp=share_link" # products.csv
path = "https://drive.google.com/uc?export=download&id="+url.split("/")[-2]
products = pd.read_csv(path)

# Performing some basic operations

In [3]:
products.head()

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
0,RAI0007,Silver Rain Design mStand Support,Aluminum support compatible with all MacBook,59.99,499.899,1,8696
1,APP0023,Apple Mac Keyboard Keypad Spanish,USB ultrathin keyboard Apple Mac Spanish.,59.0,589.996,0,13855401
2,APP0025,Mighty Mouse Apple Mouse for Mac,mouse Apple USB cable.,59.0,569.898,0,1387
3,APP0072,Apple Dock to USB Cable iPhone and iPod white,IPhone dock and USB Cable Apple iPod.,25.0,229.997,0,1230
4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,2GB RAM Mac mini and iMac (2006/07) MacBook Pr...,34.99,31.99,1,1364


In [4]:
products.tail()

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
19321,BEL0376,Belkin Travel Support Apple Watch Black,compact and portable stand vertically or horiz...,29.99,269.903,1,12282
19322,THU0060,"Enroute Thule 14L Backpack MacBook 13 ""Black",Backpack with capacity of 14 liter compartment...,69.95,649.903,1,1392
19323,THU0061,"Enroute Thule 14L Backpack MacBook 13 ""Blue",Backpack with capacity of 14 liter compartment...,69.95,649.903,1,1392
19324,THU0062,"Enroute Thule 14L Backpack MacBook 13 ""Red",Backpack with capacity of 14 liter compartment...,69.95,649.903,0,1392
19325,THU0063,"Enroute Thule 14L Backpack MacBook 13 ""Green",Backpack with capacity of 14 liter compartment...,69.95,649.903,1,1392


- sku – stock keeping unit: a unique identifier for each product
- name – product name
- desc – product description
- in_stock – whether or not the product was in stock at the moment of the data extraction
- type – a numerical code for product type

In [5]:
products.columns

Index(['sku', 'name', 'desc', 'price', 'promo_price', 'in_stock', 'type'], dtype='object')

## Dimensions of DataFrame

In [6]:
nrows = products.shape[0]
ncols = products.shape[1]
print("The number of rows is", nrows)
print("The number of columns is", ncols)

The number of rows is 19326
The number of columns is 7


Total number of values the data set has:

In [7]:
products.size

135282

## DataFrame description

In [8]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


decribe() can only be performed on int columns. In this case the only int column is in_stock

In [9]:
products.describe()

Unnamed: 0,in_stock
count,19326.0
mean,0.109593
std,0.31239
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [10]:
products.describe(include = "all")

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
count,19326,19326,19319,19280.0,19326,19326.0,19276
unique,10579,10373,7098,2690.0,4614,,126
top,PAC1035,"Apple iMac 27 ""Core i7 Retina 5K 4GHz | 8GB | ...",IMac desktop computer 27 inch 8GB RAM 512GB Re...,3649.0,27.895.848,,"5,74E+15"
freq,58,89,230,466.0,229,,6294
mean,,,,,,0.109593,
std,,,,,,0.31239,
min,,,,,,0.0,
25%,,,,,,0.0,
50%,,,,,,0.0,
75%,,,,,,0.0,


In [11]:
# There are 10.580 products
products.name.count()

19326

In [12]:
products.desc.nunique()

7098

In [13]:
products.sku.nunique()

10579

In [14]:
products.name.nunique()

10373

- 1: products that were in stock (19%)
- 0: products that were not in stock (81%)

In [15]:
products['in_stock'].value_counts()

0    17208
1     2118
Name: in_stock, dtype: int64

How is the dataset indexed?
After removing duplicated columns, the index is not going from minimum to maximum value.

In [16]:
products.index

RangeIndex(start=0, stop=19326, step=1)

## Removing duplicated rows
There are duplicated values

In [17]:
products.duplicated().sum()

8746

In [18]:
products = products.drop_duplicates()

In [19]:
products.duplicated().sum()

0

## Exploring columns price and promo_price in more detail.

In [20]:
products.loc[500:512,['price']]

Unnamed: 0,price
500,23.99
501,24.99
502,76.99
503,62.99
504,89.99
505,49.99
506,49.99
507,89.99
508,79.99
509,49.99


Couldn't convert price column to float. There are a lot of errors in this column. For example price of some products are in millions(witch is a lot larger than it should be) and other values are a lot smaller than they should be.

In [21]:
# removing all NaN values from all columns
products_1 = products.dropna()
products_1 = products.dropna(axis=0)
products_1.isnull().sum()

sku            0
name           0
desc           0
price          0
promo_price    0
in_stock       0
type           0
dtype: int64

In [34]:
products_1['price'] = products_1.price.str.replace(".","")
products_1['price'] = products_1.price.astype(float)
products_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10477 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sku          10477 non-null  object 
 1   name         10477 non-null  object 
 2   desc         10477 non-null  object 
 3   price        10477 non-null  float64
 4   promo_price  10477 non-null  object 
 5   in_stock     10477 non-null  int64  
 6   type         10477 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 654.8+ KB


  products_1['price'] = products_1.price.str.replace(".","")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  products_1['price'] = products_1.price.str.replace(".","")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  products_1['price'] = products_1.price.astype(float)


In [22]:
products_1.name.count()

10477

In [23]:
products_1.name.nunique()

10273

In [24]:
products_1.desc.nunique()

7051

In [25]:
products_1.sku.nunique()

10477

In [26]:
# most purchased product name:
products.loc[products.sku.isin(['APP1190'])]

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
3150,APP1190,IPhone AppleCare Protection Plan,Apple Care extended warranty iPhone.,70,609.997,1,1231


## Missing values

There is a small percentage of missing data.

In [27]:
# % of missing.
for col in products.columns:
    percent_missing = np.mean(products[col].isnull())
    print('{} - {}%'.format(col, round(percent_missing*100)))

sku - 0%
name - 0%
desc - 0%
price - 0%
promo_price - 0%
in_stock - 0%
type - 0%


In [28]:
# examining missing values
print("Missing values distribution: ")
print(products.isnull().mean())
print("")

Missing values distribution: 
sku            0.000000
name           0.000000
desc           0.000662
price          0.004348
promo_price    0.000000
in_stock       0.000000
type           0.004726
dtype: float64



In [29]:
# another way
products.isna().sum()

sku             0
name            0
desc            7
price          46
promo_price     0
in_stock        0
type           50
dtype: int64