Data Integration and Analysis - Chadi Amzil
----------

## Assignment Objectives
-----------

In this lab, I will: 

1. Use `requests` to retrieve data from a website through an API (application programming interface)
2. Understand `json` objects and their structure
3. Write data into a `csv` file, following a specific format
4. Read a `csv` file into a Pandas DataFrame
5. Use the `apply` and `lambda` functions to perform data manipulation on DataFrame columns

------------
<br>

In [1]:
import csv
import json

import pandas as pd
import requests

In [None]:
# Silence every warning the rest of the notebook might emit.
import warnings


def warn(*_args, **_kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments, returns None."""
    return None


# Swap the no-op in for the real function and ignore everything still routed
# through the warnings filters.
warnings.warn = warn
warnings.filterwarnings('ignore')

In [3]:
# Request 50 company records from the Faker API.
url = 'https://fakerapi.it/api/v1/companies?_quantity=50'
# requests applies no timeout by default; without one, a server that never
# answers would block this cell (and the kernel) indefinitely.
response = requests.get(url, timeout=30)

In [4]:
# Echo the URL that was requested, followed by the HTTP status code received.
print(f'requested url: {response.url}')
print(f'status of the request: {response.status_code}')

requested url: https://fakerapi.it/api/v1/companies?_quantity=50
status of the request: 200


In [5]:
# Parse the JSON body of the HTTP response into Python objects; the following
# cells index the result like a dictionary.
json_response = response.json()

In [6]:
# Top-level keys of the parsed payload
print(json_response.keys())
# Values stored under 'status' and 'code', one per line
print(json_response['status'], json_response['code'], sep='\n')
# Keep the value stored under 'total' in num_records for the CSV-writing cell
num_records = json_response['total']

dict_keys(['status', 'code', 'locale', 'seed', 'total', 'data'])
OK
200


In [7]:

# Display the field names of the first record stored under the 'data' key of json_response
json_response['data'][0].keys()

dict_keys(['id', 'name', 'email', 'vat', 'phone', 'country', 'addresses', 'website', 'image', 'contact'])

In [8]:
# Display every field of the first company record
json_response['data'][0]


{'id': 1,
 'name': 'Cormier Inc',
 'email': 'howe.richard@hirthe.com',
 'vat': '617433365',
 'phone': '+17746762097',
 'country': 'Colombia',
 'addresses': [{'id': 1,
   'street': '2566 Kerluke Cove',
   'streetName': 'Emily Motorway',
   'buildingNumber': '69597',
   'city': 'East William',
   'zipcode': '76400-6979',
   'country': 'Ecuador',
   'country_code': 'EC',
   'latitude': 36.917053,
   'longitude': -117.095049},
  {'id': 1,
   'street': '850 Zemlak Drive Suite 344',
   'streetName': 'Marcos Plains',
   'buildingNumber': '526',
   'city': 'Windlerstad',
   'zipcode': '03734',
   'country': 'Croatia',
   'country_code': 'HR',
   'latitude': -85.134648,
   'longitude': 146.195843},
  {'id': 1,
   'street': '713 Wintheiser Spur',
   'streetName': 'Lane Forge',
   'buildingNumber': '9917',
   'city': 'Shayneton',
   'zipcode': '31864',
   'country': 'Brunei',
   'country_code': 'BN',
   'latitude': -6.361593,
   'longitude': 96.723999}],
 'website': 'http://bashirian.biz',
 'imag

In [None]:
# Write each company's information into two CSV files:
#   companies_info.csv:    name, email, phone, country, website
#   companies_contact.csv: company, firstname, lastname, email, phone, birthday, gender
#
# Important note: some company names include commas. Comma characters are
# removed from the company name before it is written to either file.

# Keys read from each record (after the company name) and from its 'contact' entry.
INFO_FIELDS = ('email', 'phone', 'country', 'website')
CONTACT_FIELDS = ('firstname', 'lastname', 'email', 'phone', 'birthday', 'gender')

# newline='' hands line-ending control to the csv module; an explicit utf-8
# encoding keeps the files identical across platforms.
with open('companies_info.csv', 'w', newline='', encoding='utf-8') as file1, \
        open('companies_contact.csv', 'w', newline='', encoding='utf-8') as file2:
    # csv.writer quotes any field that contains a comma or quote, so a stray
    # delimiter in an email, country or website can no longer shift columns.
    info_writer = csv.writer(file1, lineterminator='\n')
    contact_writer = csv.writer(file2, lineterminator='\n')

    # Header rows
    info_writer.writerow(['name', 'email', 'phone', 'country', 'website'])
    contact_writer.writerow(
        ['company', 'firstname', 'lastname', 'email', 'phone', 'birthday', 'gender']
    )

    # Iterate over the records actually present instead of indexing up to the
    # reported 'total', which would raise IndexError if the two ever disagreed.
    for record in json_response['data']:
        company_name = record['name'].replace(',', '')
        contact = record['contact']

        # One row per company in each file, both starting with the cleaned name.
        info_writer.writerow([company_name] + [record[key] for key in INFO_FIELDS])
        contact_writer.writerow([company_name] + [contact[key] for key in CONTACT_FIELDS])

In [10]:
# Load companies_contact.csv into a pandas DataFrame.
# No dtypes are specified here, so pandas infers the type of every column.
df = pd.read_csv('companies_contact.csv')


In [11]:
# Report the overall shape, then the row count (size) and column count (features)
print(f'data shape: {df.shape}')
print(f'dataset size: {len(df)}')
print(f'number of features: {len(df.columns)}')


data shape: (50, 7)
dataset size: 50
number of features: 7


In [12]:
# Preview the top 5 rows of df with DataFrame.head()
df.head()


Unnamed: 0,company,firstname,lastname,email,phone,birthday,gender
0,Cormier Inc,Sadye,Halvorson,elias04@yahoo.com,15392898821,1990-04-23,female
1,Hettinger Adams and Hill,Pascale,Terry,igottlieb@runolfsson.org,16298175353,1976-06-11,female
2,Bogan-O'Reilly,Nettie,Renner,blanda.zackary@yahoo.com,12527432695,1981-02-15,female
3,Hintz Mayert and Hegmann,Jaylin,Lesch,price.lilian@hotmail.com,16694396384,2000-02-23,male
4,Turner PLC,Freida,Waters,howe.river@funk.com,14582103138,1996-12-09,female


In [13]:
# Inspect the data type pandas assigned to each column
df.dtypes

company      object
firstname    object
lastname     object
email        object
phone         int64
birthday     object
gender       object
dtype: object

In [14]:
# Cast the phone column to strings so that text operations can be applied to it
df['phone'] = df['phone'].astype('str')
# Confirm the new dtype of the column
df.dtypes

company      object
firstname    object
lastname     object
email        object
phone        object
birthday     object
gender       object
dtype: object

In [15]:
# Prefix every phone number with '00' using apply + lambda, storing the result
# back into df['phone'].
# Run this cell only once per kernel session: every run prepends another '00'.
df['phone'] = df['phone'].apply(lambda number: '00' + number)
# Confirm the change
df.head()

Unnamed: 0,company,firstname,lastname,email,phone,birthday,gender
0,Cormier Inc,Sadye,Halvorson,elias04@yahoo.com,15392898821,1990-04-23,female
1,Hettinger Adams and Hill,Pascale,Terry,igottlieb@runolfsson.org,16298175353,1976-06-11,female
2,Bogan-O'Reilly,Nettie,Renner,blanda.zackary@yahoo.com,12527432695,1981-02-15,female
3,Hintz Mayert and Hegmann,Jaylin,Lesch,price.lilian@hotmail.com,16694396384,2000-02-23,male
4,Turner PLC,Freida,Waters,howe.river@funk.com,14582103138,1996-12-09,female


In [16]:
# Keep only the rows whose gender equals 'female', compared in lower case so
# that differences in capitalisation do not matter.
is_female = df['gender'].str.lower() == 'female'
# .copy() makes df_female an independent DataFrame: the column drop and the
# column assignment applied to it in later cells then never operate on a
# slice of df, which is what triggers pandas' SettingWithCopyWarning.
df_female = df[is_female].copy()

# verification
df_female.head()

Unnamed: 0,company,firstname,lastname,email,phone,birthday,gender
0,Cormier Inc,Sadye,Halvorson,elias04@yahoo.com,15392898821,1990-04-23,female
1,Hettinger Adams and Hill,Pascale,Terry,igottlieb@runolfsson.org,16298175353,1976-06-11,female
2,Bogan-O'Reilly,Nettie,Renner,blanda.zackary@yahoo.com,12527432695,1981-02-15,female
4,Turner PLC,Freida,Waters,howe.river@funk.com,14582103138,1996-12-09,female
5,Daugherty PLC,Fay,Koch,antwon43@gleichner.com,18505427176,2020-01-24,female


In [None]:
# Remove the gender column; every remaining row already passed the female filter.
# The result is rebound to df_female instead of using inplace=True, so no
# filtered slice is mutated in place, and errors='ignore' lets the cell be
# re-run without a KeyError once the column is gone.
df_female = df_female.drop(columns=['gender'], errors='ignore')
# Show the frame to verify that the gender column was removed
df_female.head()

Unnamed: 0,company,firstname,lastname,email,phone,birthday
0,Cormier Inc,Sadye,Halvorson,elias04@yahoo.com,15392898821,1990-04-23
1,Hettinger Adams and Hill,Pascale,Terry,igottlieb@runolfsson.org,16298175353,1976-06-11
2,Bogan-O'Reilly,Nettie,Renner,blanda.zackary@yahoo.com,12527432695,1981-02-15
4,Turner PLC,Freida,Waters,howe.river@funk.com,14582103138,1996-12-09
5,Daugherty PLC,Fay,Koch,antwon43@gleichner.com,18505427176,2020-01-24


In [18]:
# Create a function that takes a string object that contains a date in the form 'yyyy-mm-dd' and returns
# 0: if the birth year is less than the cutoff year (1980 by default)
# 1: otherwise.

from datetime import datetime

def ageFlag(date_str, cutoff_year=1980):
    """Flag whether a birth date falls before a cutoff year.

    Parameters
    ----------
    date_str : str
        Date written as 'yyyy-mm-dd'.
    cutoff_year : int, optional
        Year the birth year is compared against. Defaults to 1980.

    Returns
    -------
    int
        0 if the year of ``date_str`` is less than ``cutoff_year``, 1 otherwise.

    Raises
    ------
    ValueError
        If ``date_str`` does not match the '%Y-%m-%d' format.
    """
    # Parse the string with strptime so that malformed dates are rejected
    # instead of being silently mis-read.
    birth_year = datetime.strptime(date_str, '%Y-%m-%d').year
    return 0 if birth_year < cutoff_year else 1

In [19]:
# Replace each 'birthday' date string in df_female with the 0/1 flag returned by ageFlag.
# The result overwrites the 'birthday' column, so the original dates are no longer in df_female.
# Re-running this cell fails: the column then holds integers, which ageFlag cannot parse as dates.
df_female['birthday']=df_female['birthday'].apply(ageFlag)
# Show the frame to verify that the column now contains 0 and 1
df_female.head()

Unnamed: 0,company,firstname,lastname,email,phone,birthday
0,Cormier Inc,Sadye,Halvorson,elias04@yahoo.com,15392898821,1
1,Hettinger Adams and Hill,Pascale,Terry,igottlieb@runolfsson.org,16298175353,0
2,Bogan-O'Reilly,Nettie,Renner,blanda.zackary@yahoo.com,12527432695,1
4,Turner PLC,Freida,Waters,howe.river@funk.com,14582103138,1
5,Daugherty PLC,Fay,Koch,antwon43@gleichner.com,18505427176,1


In [20]:
# Rename 'birthday' to 'bornbefore1980' now that the column holds the ageFlag result.
# NOTE(review): ageFlag returns 0 for birth years before 1980 and 1 otherwise, so a
# 1 in this column means "1980 or later" - confirm the name reads as intended.
df_female = df_female.rename(columns={'birthday': 'bornbefore1980'})
df_female.head()

Unnamed: 0,company,firstname,lastname,email,phone,bornbefore1980
0,Cormier Inc,Sadye,Halvorson,elias04@yahoo.com,15392898821,1
1,Hettinger Adams and Hill,Pascale,Terry,igottlieb@runolfsson.org,16298175353,0
2,Bogan-O'Reilly,Nettie,Renner,blanda.zackary@yahoo.com,12527432695,1
4,Turner PLC,Freida,Waters,howe.river@funk.com,14582103138,1
5,Daugherty PLC,Fay,Koch,antwon43@gleichner.com,18505427176,1


In [21]:
# Save df_female to femalecontacts.csv; index=False keeps the row index out of the file
df_female.to_csv('femalecontacts.csv', index= False)