# Data Scientist Associate Practical Exam Submission


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook, using the SVG figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Apply seaborn's 'notebook' plotting context.
# NOTE(review): 'notebook' is presumably seaborn's default context, so this may not enlarge plots — confirm.
sns.set_context('notebook')

# Enable multiple cell outputs: every top-level expression in a cell is displayed, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Raise pandas display limits (column width, number of columns, number of rows) to 500 each.
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Setting seed for entire notebook (this seeds NumPy's global random state).
SEED = 42
np.random.seed(SEED)


## Task 1 | Data Cleaning and Validation

#### Dataset has 1500 rows and 8 columns.
- **owned**:
    - There were no missing values in the rows of this column. Converted datatype from *int* to a *bool*
    - Unique values of *(1,0)* as expected.

- **make_model**:
    - No missing values in this column. Column had 6 unique values as expected, converted from a *pandas object* to *category* datatype.

- **review_month**:
    - No missing values in this column. However, there were 332 unique values instead of the expected 12. Further probing revealed most of the entries had different formats but all included three letters for the month. *Regex* was used to extract those letters and the rest was discarded.
    - After this process the column had 12 unique values, as expected. Converted to an *ordered category* datatype.

- **web_browser**:
    - The 150 missing values in this column were all replaced with the keyword `unknown`. The remaining data had 6 unique values as expected.
    - Column has 7 unique values after cleaning. Converted to *category* datatype.

- **reviewer_age**:
    - 105 reviewers did not have an entry for their age. The *mean* age of **33** years computed from the rest of the dataset was used in place of these missing values.
    - The rest of the values were converted from a *pandas object* to an *int* datatype.
    - From the 1500 entries, only 35 unique values were observed. This could be an indication that most of the reviewers can be categorized using age groups.

- **primary_use**:
    - Has no missing values.
    - Two unique values observed as expected. Converted to a *category* datatype.

- **value_for_money**:
    - No missing values
    - 10 unique values as expected. Kept only the score before the `/` (e.g. `5/10` becomes `5`) and converted the column to an *int8* datatype.

- **overall_rating**:
    - No missing values.
    - All values lie in expected range (0 - 25).

#### Dataset has 1500 rows and 8 columns after the cleaning process. 
- No rows or columns were dropped.
- Memory usage for cleaned dataframe dropped from 94KB to 23KB

In [59]:
# Read the raw review file, then show its dimensions and the per-column dtype / non-null summary.
moped_reviews = pd.read_csv(
    'dataset/electric_bike_ratings_2212.csv',
)
moped_reviews.shape   # (rows, columns)
moped_reviews.info()  # dtypes, non-null counts, memory usage

(1500, 8)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   owned            1500 non-null   int64  
 1   make_model       1500 non-null   object 
 2   review_month     1500 non-null   object 
 3   web_browser      1350 non-null   object 
 4   reviewer_age     1500 non-null   object 
 5   primary_use      1500 non-null   object 
 6   value_for_money  1500 non-null   object 
 7   overall_rating   1500 non-null   float64
dtypes: float64(1), int64(1), object(6)
memory usage: 93.9+ KB


In [60]:
# Distinct non-missing values per column of the raw frame.
moped_reviews.nunique(dropna=True)
# How many reviews have no browser recorded.
moped_reviews['web_browser'].isnull().sum()

owned                2
make_model           6
review_month       332
web_browser          6
reviewer_age        36
primary_use          2
value_for_money     10
overall_rating     583
dtype: int64

150

In [61]:
# Preview the first five raw rows.
moped_reviews.head(n=5)
# List the raw age entries: the column is stored as strings and includes a '-' entry.
pd.unique(moped_reviews['reviewer_age'])

Unnamed: 0,owned,make_model,review_month,web_browser,reviewer_age,primary_use,value_for_money,overall_rating
0,1,Nielah-Eyden,Oct,Chrome,23,Commuting,5/10,18.62
1,0,Nielah-Keetra,Jun,,24,Commuting,4/10,15.55
2,0,Lunna-Keetra,23-Oct,Chrome,28,Commuting,5/10,12.76
3,1,Hoang-Keetra,07-Nov,IE,41,Leisure,5/10,17.07
4,1,Lunna-Keetra,16-Sep,Chrome,33,Leisure,5/10,12.29


array(['23', '24', '28', '41', '33', '47', '20', '32', '35', '22', '29',
       '39', '42', '36', '40', '27', '48', '34', '16', '21', '25', '30',
       '-', '46', '37', '31', '17', '44', '26', '38', '49', '50', '45',
       '19', '43', '18'], dtype=object)

In [73]:
# All cleaning is written to this copy; the raw frame is only read from in the cells below.
moped_reviews_cleaned = moped_reviews.copy(deep=True)

In [79]:
# Raw encoding of the ownership flag.
pd.unique(moped_reviews['owned'])
# Store the 0/1 flag as a boolean column.
moped_reviews_cleaned['owned'] = moped_reviews_cleaned['owned'].astype(dtype='bool')
pd.unique(moped_reviews_cleaned['owned'])


array([1, 0], dtype=int64)

array([ True, False])

In [80]:
# Distinct make/model labels in the raw data.
pd.unique(moped_reviews['make_model'])
# A small fixed set of labels, so store it as a categorical.
moped_reviews_cleaned['make_model'] = moped_reviews_cleaned['make_model'].astype(dtype='category')


array(['Nielah-Eyden', 'Nielah-Keetra', 'Lunna-Keetra', 'Hoang-Keetra',
       'Lunna-Eyden', 'Hoang-Eyden'], dtype=object)

In [134]:
# Baseline checks on the raw column: number of distinct raw strings and number of missing entries.
moped_reviews['review_month'].nunique()
moped_reviews['review_month'].isna().sum()

# Raw entries come in several formats (e.g. 'Oct', '23-Oct', '07-Nov').
# Keep only the alphabetic characters of each entry; digits and separators are discarded.
# The extraction is built once here and reused below instead of running the same pipeline twice.
month_abbrev = (moped_reviews['review_month']
                .str.findall(r'[A-Za-z]+')
                .str.join(''))
month_abbrev.unique()

# Calendar order, used as the category order of the cleaned column.
month_cat = ['Jan','Feb', 'Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
moped_reviews_cleaned['review_month'] = (month_abbrev
                                         .astype('category')
                                         .cat.set_categories(month_cat, ordered=True))

# set_categories turns any value that is not in month_cat into NaN; fail loudly rather than lose data silently.
assert moped_reviews_cleaned['review_month'].notna().all(), 'review_month contains values outside month_cat'
moped_reviews_cleaned['review_month'].unique()



332

0

array(['Oct', 'Jun', 'Nov', 'Sep', 'Aug', 'Mar', 'Jul', 'Apr', 'Dec',
       'Feb', 'Jan', 'May'], dtype=object)

['Oct', 'Jun', 'Nov', 'Sep', 'Aug', ..., 'Apr', 'Dec', 'Feb', 'Jan', 'May']
Length: 12
Categories (12, object): ['Jan' < 'Feb' < 'Mar' < 'Apr' ... 'Sep' < 'Oct' < 'Nov' < 'Dec']

In [139]:
# Raw column: count of missing entries and count of distinct recorded browsers.
moped_reviews['web_browser'].isnull().sum()
moped_reviews['web_browser'].nunique(dropna=True)
# Label missing browsers explicitly, then store the column as a categorical.
moped_reviews_cleaned['web_browser'] = (moped_reviews_cleaned['web_browser']
                                        .fillna(value='unknown')
                                        .astype('category'))
# After cleaning: the distinct labels, and confirmation that nothing is missing.
moped_reviews_cleaned['web_browser'].unique()
moped_reviews_cleaned['web_browser'].isnull().sum()


150

6

['Chrome', 'unknown', 'IE', 'Firefox', 'Safari', 'Android', 'Opera']
Categories (7, object): ['Android', 'Chrome', 'Firefox', 'IE', 'Opera', 'Safari', 'unknown']

0

In [141]:
# '-' is the raw file's placeholder for a missing age. Blank it out and parse the rest as numbers,
# so the column is numeric before imputation instead of an object column mixing strings and a float.
# pd.to_numeric is left strict on purpose: any other non-numeric entry still raises.
reviewer_age_numeric = pd.to_numeric(
    moped_reviews['reviewer_age'].where(moped_reviews['reviewer_age'] != '-')
)

# Mean of the known ages (NaN entries are skipped), rounded to a whole year.
mean_age = reviewer_age_numeric.mean().round()

# Impute the missing ages with the rounded mean and store as a compact integer dtype.
moped_reviews_cleaned['reviewer_age'] = reviewer_age_numeric.fillna(mean_age).astype(np.int8)

# Number of placeholder entries in the raw data, then the distinct ages after cleaning.
len(moped_reviews[moped_reviews['reviewer_age'] == '-'])
moped_reviews_cleaned['reviewer_age'].unique()
moped_reviews_cleaned['reviewer_age'].nunique()


105

array([23, 24, 28, 41, 33, 47, 20, 32, 35, 22, 29, 39, 42, 36, 40, 27, 48,
       34, 16, 21, 25, 30, 46, 37, 31, 17, 44, 26, 38, 49, 50, 45, 19, 43,
       18], dtype=int8)

35

In [81]:
# Distinct usage labels in the raw data.
pd.unique(moped_reviews['primary_use'])
# Only a couple of labels, so store the column as a categorical.
moped_reviews_cleaned['primary_use'] = moped_reviews_cleaned['primary_use'].astype(dtype='category')


array(['Commuting', 'Leisure'], dtype=object)

In [87]:
# Scores are stored as 'n/10' strings: keep the part before the slash and store it as a small integer.
moped_reviews_cleaned['value_for_money'] = (moped_reviews_cleaned['value_for_money']
                                            .str.split('/')
                                            .str.get(0)
                                            .astype(np.int8))

# Distinct scores after conversion.
pd.unique(moped_reviews_cleaned['value_for_money'])


array([ 5,  4,  3,  7,  6,  2,  9,  8, 10,  1], dtype=int8)

In [92]:
# Summary statistics (min / max show the observed range) for the raw rating column.
moped_reviews['overall_rating'].describe()
# Number of missing ratings.
moped_reviews['overall_rating'].isnull().sum()

count    1500.000000
mean       17.138907
std         2.445566
min        11.690000
25%        15.320000
50%        18.240000
75%        18.840000
max        22.760000
Name: overall_rating, dtype: float64

0

In [135]:
# Final check on the cleaned frame: per-column dtypes, non-null counts and memory usage.
moped_reviews_cleaned.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   owned            1500 non-null   bool    
 1   make_model       1500 non-null   category
 2   review_month     1500 non-null   category
 3   web_browser      1500 non-null   category
 4   reviewer_age     1500 non-null   int8    
 5   primary_use      1500 non-null   category
 6   value_for_money  1500 non-null   int8    
 7   overall_rating   1500 non-null   float64 
dtypes: bool(1), category(4), float64(1), int8(2)
memory usage: 23.2 KB


## Task 2
*Write your description here*

## Task 3
*Write your description here*

## Task 4
*Write your description here*

## Task 5
*Write your description here*

## Task 6
*Write your description here*

In [None]:
# Start coding here... 

## Task 7
*Write your description here*

In [None]:
# Start coding here... 

## Task 8
*Write your description here*

## Task 9
*Write your description here*

In [None]:
# Start coding here... 

## Task 10
*Write your description here*

## ✅  When you have finished...
- Publish your Workspace using the option on the left
- Check the published version of your report:
	- Can you see everything you want us to grade?
    - Are all the graphics visible?
- Review the grading rubric. Have you included everything that will be graded?
- Head back to the [Certification dashboard](https://app.datacamp.com/certification) to submit your practical exam