In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
%matplotlib inline

In [18]:
def download(url, filename, timeout=30):
    """Download ``url`` and write the response body to ``filename``.

    Parameters
    ----------
    url : str
        Address of the resource to fetch with an HTTP GET request.
    filename : str
        Path of the local file to create (or overwrite) in binary mode.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on an unresponsive host.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors; otherwise an error page would be saved
    # to disk and later parsed as if it were the dataset.
    response.raise_for_status()
    with open(filename, "wb") as f:
        f.write(response.content)

In [19]:
# Remote location of the laptop pricing dataset (CSV).
file_path = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod1.csv"

In [20]:
# Name the local copy first so the download target and the path read
# later by pandas are guaranteed to be the same file.
file_name = "laptops.csv"
download(file_path, file_name)

In [21]:
# Load the downloaded CSV; the first row of the file supplies the column names.
df = pd.read_csv(filepath_or_buffer=file_name, header=0)

<p>Note that we can update the Screen_Size_cm column such that all values are rounded to nearest 2 decimal places by using numpy.round()</p>

In [22]:
# Round the screen size (in cm) to two decimal places, then preview the frame.
df['Screen_Size_cm'] = np.round(df['Screen_Size_cm'], decimals=2)
df.head()

Unnamed: 0.1,Unnamed: 0,Manufacturer,Category,Screen,GPU,OS,CPU_core,Screen_Size_cm,CPU_frequency,RAM_GB,Storage_GB_SSD,Weight_kg,Price
0,0,Acer,4,IPS Panel,2,1,5,35.56,1.6,8,256,1.6,978
1,1,Dell,3,Full HD,1,1,3,39.62,2.0,4,256,2.2,634
2,2,Dell,3,Full HD,1,1,7,39.62,2.7,8,256,2.2,946
3,3,Dell,4,IPS Panel,2,1,5,33.78,1.6,8,128,1.22,1244
4,4,HP,4,Full HD,2,1,7,39.62,1.8,8,256,1.91,837


<h4>Evaluate the dataset for missing data</h4>

In [23]:
# Boolean mask of the same shape as df: True wherever a value is missing.
missing_data = df.isna()

# For every column, print its name followed by how many entries are
# present (False) and how many are missing (True).
for column in missing_data.columns:
    flags = missing_data[column]
    print(column)
    print(flags.value_counts())
    print()

Unnamed: 0
Unnamed: 0
False    238
Name: count, dtype: int64

Manufacturer
Manufacturer
False    238
Name: count, dtype: int64

Category
Category
False    238
Name: count, dtype: int64

Screen
Screen
False    238
Name: count, dtype: int64

GPU
GPU
False    238
Name: count, dtype: int64

OS
OS
False    238
Name: count, dtype: int64

CPU_core
CPU_core
False    238
Name: count, dtype: int64

Screen_Size_cm
Screen_Size_cm
False    234
True       4
Name: count, dtype: int64

CPU_frequency
CPU_frequency
False    238
Name: count, dtype: int64

RAM_GB
RAM_GB
False    238
Name: count, dtype: int64

Storage_GB_SSD
Storage_GB_SSD
False    238
Name: count, dtype: int64

Weight_kg
Weight_kg
False    233
True       5
Name: count, dtype: int64

Price
Price
False    238
Name: count, dtype: int64



#### Replace with mean

In [24]:
# replacing missing data with mean
avg_weight=df['Weight_kg'].astype('float').mean(axis=0)
df.replace({'Weight_kg':np.nan}, avg_weight, inplace=True)

#### Replace with the most frequent value

In [25]:
# replacing missing data with mode
common_screen_size = df['Screen_Size_cm'].value_counts().idxmax()
df.replace({'Screen_Size' : np.nan}, common_screen_size, inplace=True)

<h4>Fixing the data types</h4>

In [26]:
# Make sure both measurement columns are stored as floating point numbers.
float_columns = ["Weight_kg", "Screen_Size_cm"]
df[float_columns] = df[float_columns].astype("float")

<h4>Data Standardization</h4>

In [27]:
# Data standardization: convert weight from kg to pounds
df["Weight_kg"] = df["Weight_kg"]*2.205
df.rename(columns={'Weight_kg':'Weight_pounds'}, inplace=True)

# Data standardization: convert screen size from cm to inch
df["Screen_Size_cm"] = df["Screen_Size_cm"]/2.54
df.rename(columns={'Screen_Size_cm':'Screen_Size_inch'}, inplace=True)

#### Data Normalization

In [28]:
# Simple feature scaling: divide by the column maximum so the largest
# CPU frequency becomes 1.0.
max_cpu_frequency = df['CPU_frequency'].max()
df['CPU_frequency'] = df['CPU_frequency'] / max_cpu_frequency

### Binning

In [29]:
# Split the observed price range into three equal-width intervals
# (four evenly spaced edges) and label every row with its interval.
group_names = ['Low', 'Medium', 'High']
price = df["Price"]
bins = np.linspace(price.min(), price.max(), num=4)
# include_lowest=True keeps the cheapest laptop inside the first bin.
df['Price-binned'] = pd.cut(price, bins=bins, labels=group_names, include_lowest=True)

In [33]:
# Plot the number of laptops in each price bin.
# value_counts() orders its result by descending frequency, not by bin,
# so the counts are reindexed by group_names to guarantee that each bar
# height is paired with the label it belongs to.
bin_counts = df["Price-binned"].value_counts().reindex(group_names)
plt.bar(group_names, bin_counts)
plt.xlabel("Price")
plt.ylabel("count")
plt.title("Price bins")

Text(0.5, 1.0, 'Price bins')

### Indicator variables
Convert the "Screen" attribute of the dataset into 2 indicator variables, "Screen-IPS_panel" and "Screen-Full_HD". Then drop the "Screen" attribute from the dataset.


In [31]:
#Indicator Variable: Screen
dummy_variable_1 = pd.get_dummies(df["Screen"])
dummy_variable_1.rename(columns={'IPS Panel':'Screen-IPS_panel', 'Full HD':'Screen-Full_HD'}, inplace=True)
df = pd.concat([df, dummy_variable_1], axis=1)

# drop original column "Screen" from "df"
df.drop("Screen", axis = 1, inplace=True)

In [32]:
# Show the first five rows of the transformed frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Manufacturer,Category,GPU,OS,CPU_core,Screen_Size_inch,CPU_frequency,RAM_GB,Storage_GB_SSD,Weight_pounds,Price,Price-binned,Screen-Full_HD,Screen-IPS_panel
0,0,Acer,4,2,1,5,14.0,0.551724,8,256,3.528,978,Low,False,True
1,1,Dell,3,1,1,3,15.598425,0.689655,4,256,4.851,634,Low,True,False
2,2,Dell,3,1,1,7,15.598425,0.931034,8,256,4.851,946,Low,True,False
3,3,Dell,4,2,1,5,13.299213,0.551724,8,128,2.6901,1244,Low,False,True
4,4,HP,4,2,1,7,15.598425,0.62069,8,256,4.21155,837,Low,True,False
