In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats



In [2]:
# Load the scraped search results into a DataFrame.
# NOTE(review): this is an absolute, user-specific path, so the notebook only runs on a
# machine that has the file at exactly this location.
# NOTE(review): the .jsonl extension suggests JSON Lines; if the file really is
# line-delimited, read_json would need lines=True — TODO confirm against the file.
df_smartphone = pd.read_json('/Users/brunobernardo/E-commerce-/Smartphones/search_output.jsonl')

In [3]:
# Display the loaded frame; the rendered output is truncated in the middle for long frames.
df_smartphone

Unnamed: 0,title,url,rating,review_count,price
0,SAMSUNG Galaxy S20 FE 5G Factory Unlocked Andr...,https://www.amazon.com/gp/slredirect/picassoRe...,4.6 out of 5 stars,4936,$599.99
1,"Xgody X3 Smartphone Unlocked, 6.26” HD Perfora...",https://www.amazon.com/gp/slredirect/picassoRe...,3.3 out of 5 stars,37,$84.99
2,"Samsung Galaxy A21s (A217F) 128GB, Dual-SIM, 6...",https://www.amazon.com/Samsung-Dual-SIM-Infini...,4.1 out of 5 stars,62,$219.99
3,"Nokia 3V TA-1182 16GB 6.26"" HD+ Display 13MP C...",https://www.amazon.com/Nokia-TA-1182-Smartphon...,3.3 out of 5 stars,12,
4,DOOGEE S59 Pro Unlocked Rugged Smartphone 1005...,https://www.amazon.com/DOOGEE-S59-Pro-Smartpho...,4.3 out of 5 stars,119,$239.99
...,...,...,...,...,...
1139,Moto G7 Plus | Unlocked | Made for US by Motor...,https://www.amazon.com/Moto-Unlocked-Motorola-...,4.5 out of 5 stars,1898,
1140,Motorola One Fusion | Unlocked | GSM Only | 4/...,https://www.amazon.com/Motorola-Fusion-Unlocke...,4.4 out of 5 stars,161,$149.99
1141,Moto G Stylus 5G | 2021 | 2-Day Battery | Unlo...,https://www.amazon.com/Stylus-Battery-Unlocked...,4.7 out of 5 stars,18,$399.99
1142,"Samsung Galaxy A42 5G, Factory Unlocked Smartp...",https://www.amazon.com/gp/slredirect/picassoRe...,5.0 out of 5 stars,3,$349.99


In [4]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df_smartphone.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1144 entries, 0 to 1143
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         1144 non-null   object
 1   url           1144 non-null   object
 2   rating        1086 non-null   object
 3   review_count  1086 non-null   object
 4   price         838 non-null    object
dtypes: object(5)
memory usage: 44.8+ KB


In [5]:
# (number of rows, number of columns) of the raw frame.
df_smartphone.shape

(1144, 5)

In [6]:
# Remove the leading currency symbol from every price string.
# regex=False makes '$' a literal character: as a regular expression '$' is the
# end-of-string anchor, and the default for `regex` differs between pandas versions.
# Only '$' is removed here; thousands separators (',') are left in place.
df_smartphone['price'] = df_smartphone['price'].str.replace('$', '', regex=False)
df_smartphone['price']

0       599.99
1        84.99
2       219.99
3         None
4       239.99
         ...  
1139      None
1140    149.99
1141    399.99
1142    349.99
1143     65.99
Name: price, Length: 1144, dtype: object

In [7]:
# Preview the first rows to check that the '$' prefix is gone from the price column.
df_smartphone.head()

Unnamed: 0,title,url,rating,review_count,price
0,SAMSUNG Galaxy S20 FE 5G Factory Unlocked Andr...,https://www.amazon.com/gp/slredirect/picassoRe...,4.6 out of 5 stars,4936,599.99
1,"Xgody X3 Smartphone Unlocked, 6.26” HD Perfora...",https://www.amazon.com/gp/slredirect/picassoRe...,3.3 out of 5 stars,37,84.99
2,"Samsung Galaxy A21s (A217F) 128GB, Dual-SIM, 6...",https://www.amazon.com/Samsung-Dual-SIM-Infini...,4.1 out of 5 stars,62,219.99
3,"Nokia 3V TA-1182 16GB 6.26"" HD+ Display 13MP C...",https://www.amazon.com/Nokia-TA-1182-Smartphon...,3.3 out of 5 stars,12,
4,DOOGEE S59 Pro Unlocked Rugged Smartphone 1005...,https://www.amazon.com/DOOGEE-S59-Pro-Smartpho...,4.3 out of 5 stars,119,239.99


In [8]:
# Every column, including price, is still stored with dtype object at this point.
df_smartphone.dtypes

title           object
url             object
rating          object
review_count    object
price           object
dtype: object

In [9]:
# Not surprisingly, the price column is stored as an object.
# The ‘$’ and ‘,’ characters are dead giveaways that the price column is not a numeric column.
# More than likely we want to do some math on the column, so let’s try to convert it to a float.

In a real-world data set, you may not be so quick to see that there are non-numeric values in the column. With this data set, my first approach was to try astype():

# Deliberate demonstration: the cast fails because some price strings still contain
# a ',' thousands separator. The error is caught and printed so that a full
# "Restart & Run All" is not halted by this cell.
try:
    df_smartphone['price'].astype('float')
except ValueError as err:
    print('ValueError:', err)

ValueError: could not convert string to float: '1,298.00'

In [10]:
# The traceback includes a ValueError and shows that it could not convert the string '1,298.00' to a float.

#  Ok. That should be easy to clean up.

#   Basically, I assumed that an object column contained all strings. 

#    In reality, an object column can contain a mixture of multiple types.

#     Let’s look at the types in this data set.

In [12]:
# Report the Python type of every individual entry in the price column.
df_smartphone['price'].apply(type)

0            <class 'str'>
1            <class 'str'>
2            <class 'str'>
3       <class 'NoneType'>
4            <class 'str'>
               ...        
1139    <class 'NoneType'>
1140         <class 'str'>
1141         <class 'str'>
1142         <class 'str'>
1143         <class 'str'>
Name: price, Length: 1144, dtype: object

In [None]:
# This nicely shows the issue. 
#  The apply(type) code runs the type function on each value in the column.
#   As you can see, some of the values are NoneType and some are strings. 
#    Overall, the column dtype is an object.

In [None]:
# First, we can add a formatted column that shows each type:

In [None]:
# Record, row by row, the name of the Python type held in the price column.
df_smartphone['price_Type'] = [type(value).__name__ for value in df_smartphone['price']]

In [None]:
# here is a more compact way to check the types of data in a column using value_counts()

In [14]:
# Tally how many price entries there are of each Python type.
df_smartphone['price'].apply(type).value_counts()

<class 'str'>         838
<class 'NoneType'>    306
Name: price, dtype: int64

In [15]:
def clean_currency(x):
    """Strip currency formatting from a single cell value.

    A string has every '$' and ',' character removed and is returned as a
    (still textual) string. Any value that is not a string, such as None or
    a number, is handed back untouched.
    """
    if not isinstance(x, str):
        return x
    stripped = x
    for symbol in ('$', ','):
        stripped = stripped.replace(symbol, '')
    return stripped

In [None]:
# This function will check if the supplied value is a string and if it is, 
# will remove all the characters we don’t need. 
# If it is not a string, then it will return the original value.

In [16]:
# Here is how we call it and convert the results to a float. I also show the column with the types:

# Clean each price value, then cast the whole column to float.
df_smartphone['price'] = (
    df_smartphone['price']
    .apply(clean_currency)
    .astype('float')
)
# Refresh the helper column so it reports the types held after the conversion.
df_smartphone['price_Type'] = [type(value).__name__ for value in df_smartphone['price']]

In [17]:
# We can also check the dtypes: price should now be float64 instead of object.

df_smartphone.dtypes

title            object
url              object
rating           object
review_count     object
price           float64
price_Type       object
dtype: object

In [18]:
# Or look at the value_counts: every entry now reports float, including the rows
# whose price was None before the cast.

df_smartphone['price'].apply(type).value_counts()

<class 'float'>    1144
Name: price, dtype: int64