## Cleaning the Adobo transactions dataset

Correct problematic data entries using the pandas operations in `D2N1_Pandas_Data_Processing.ipynb`

In [1]:
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
os.chdir('/content/drive/MyDrive/my_workspace')

# 1. Read csv

In [4]:
df = pd.read_csv('Data/cc_dirty.csv')
df

Unnamed: 0,cc_num,gender,city,city_pop,job,dob,acct_num,acct_num2,trans_num,unix_time,category,amt
0,6.760000e+11,M,Dasmarinas,"659,019 people",Chartered loss adjuster,12/12/1958,798000000000,798000000000,a72eaa86b043eed95b25bbb25b3153a1,1581314011,shopping_net,$68.88
1,3.520000e+15,M,Digos,"169,393 people","Administrator, charities/voluntary organisations",31/08/1970,968000000000,968000000000,060d12f91c13871a13963041736a4702,1590902968,entertainment,$50.06
2,4.140000e+18,Male,Calapan,"133,893 people",Financial controller,23/07/1953,628000000000,628000000000,18aafb6098ab0923886c0ac83592ef8d,1585461157,food_dining,$105.44
3,4.870000e+15,M,San Fernando,"121,812 people",,18/07/1964,863000000000,863000000000,45bbe714e51ab8a375454d39a190b0cb,1613063704,food_dining,$38.61
4,4.720000e+15,M,Laoag,"111,125 people",Dance movement psychotherapist,11/01/1954,257000000000,257000000000,c20ee88b451f637bc6893b7460e9fee0,1601282159,gas_transport,$82.69
...,...,...,...,...,...,...,...,...,...,...,...,...
111872,4.620000e+15,M,San Fernando,"121,812 people",Personnel officer,20/11/1934,555000000000,555000000000,6ced184c93e66028e8d235ad3060de90,1625341374,personal_care,$31.37
111873,3.710000e+14,F,Maasin,"85,560 people",,18/04/1966,994000000000,994000000000,0d1facd7a2af816a6fbb404a8ad6d77a,1630115183,grocery_pos,$83.14
111874,4.430000e+15,M,General Santos,"594,446 people",,26/06/1960,319000000000,319000000000,599d3eec9b385473aa3791b620691416,1636919758,home,$74.76
111875,3.800000e+13,M,City of Isabela,"112,788 people",Sport and exercise psychologist,04/03/1967,378000000000,378000000000,67f47400873e964e10e029050dbe1fee,1605863062,shopping_pos,$3.27


# 2. View unique entries in each column

In [5]:
# replace column name
df['gender'].unique()

array(['M', 'Male', 'F', 'Female'], dtype=object)

# 3. Make a tally for each unique value in each column

In [6]:
# replace column name
df['gender'].value_counts()

gender
M         73836
Male      31692
F          3812
Female     2537
Name: count, dtype: int64

# 4. Count NaNs (if any) in each column

In [7]:
# replace column name
df['gender'].isna().sum()

0

# 5. Check if there are duplicated rows and keep only the first instance

In [8]:
df[df.duplicated()]

Unnamed: 0,cc_num,gender,city,city_pop,job,dob,acct_num,acct_num2,trans_num,unix_time,category,amt
542,2.470000e+15,Male,San Fernando,"306,659 people","Surveyor, quantity",01/11/1935,581000000000,581000000000,5c3870bc2fba1f8b3053301d4755cc93,1619237689,gas_transport,$56.48
1399,6.760000e+11,Male,Dasmarinas,"659,019 people",Chartered loss adjuster,12/12/1958,798000000000,798000000000,10ec92c08938a39bfb9d78a2669b3b37,1586604673,grocery_pos,$84.71
1737,4.140000e+18,Male,Calapan,"133,893 people",Financial controller,23/07/1953,628000000000,628000000000,b511aabc59c44e85742fe271c50c5277,1605248178,grocery_pos,$116.35
2171,4.060000e+15,M,Valenzuela,"620,422 people",Operational investment banker,06/12/1963,231000000000,231000000000,7788194c54508b5ae6b7bc72b2c0c288,1618046260,entertainment,$4.77
2318,2.320000e+15,M,Binan,"333,028 people","Teacher, early years/pre",08/02/1960,664000000000,664000000000,9fc5855a3dd66dafd2f10103ed56dca9,1586698232,kids_pets,$30.91
...,...,...,...,...,...,...,...,...,...,...,...,...
111856,3.520000e+15,M,Masbate,"95,389 people",Wellsite geologist,20/11/1967,531000000000,531000000000,43d23627e51fb2c25d45f07f0b70020f,1638122056,personal_care,$16.46
111866,3.010000e+13,M,Manila,"23,088,000 people",Database administrator,20/11/1948,989000000000,989000000000,14f30e3868ca56b563ea666fadbfebae,1637632301,shopping_pos,$4.61
111868,6.390000e+11,Female,Dumaguete City,"131,377 people","Accountant, chartered certified",12/01/1942,878000000000,878000000000,afabb3c578d4bdb8071bb4dfce8b0574,1626015968,kids_pets,$19.78
111873,3.710000e+14,F,Maasin,"85,560 people",,18/04/1966,994000000000,994000000000,0d1facd7a2af816a6fbb404a8ad6d77a,1630115183,grocery_pos,$83.14


In [10]:
df = df.drop_duplicates()

# 6. Replace 'Male' and 'Female' in `gender` by 'M' and 'F'

In [12]:
df['gender'] = df['gender'].str.replace('Male','M')
df['gender'] = df['gender'].str.replace('Female','F')
df['gender'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gender'] = df['gender'].str.replace('Male','M')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gender'] = df['gender'].str.replace('Female','F')


gender
M    94296
F     5704
Name: count, dtype: int64

# 7. Remove "$" in `amt` and convert to float
*Hint: convert to desired data type using method `.astype()`*

In [13]:
df['amt'] = df['amt'].str.replace('$','').astype(float)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['amt'] = df['amt'].str.replace('$','').astype(float)


Unnamed: 0,cc_num,gender,city,city_pop,job,dob,acct_num,acct_num2,trans_num,unix_time,category,amt
0,676000000000.0,M,Dasmarinas,"659,019 people",Chartered loss adjuster,12/12/1958,798000000000,798000000000,a72eaa86b043eed95b25bbb25b3153a1,1581314011,shopping_net,68.88
1,3520000000000000.0,M,Digos,"169,393 people","Administrator, charities/voluntary organisations",31/08/1970,968000000000,968000000000,060d12f91c13871a13963041736a4702,1590902968,entertainment,50.06
2,4.14e+18,M,Calapan,"133,893 people",Financial controller,23/07/1953,628000000000,628000000000,18aafb6098ab0923886c0ac83592ef8d,1585461157,food_dining,105.44
3,4870000000000000.0,M,San Fernando,"121,812 people",,18/07/1964,863000000000,863000000000,45bbe714e51ab8a375454d39a190b0cb,1613063704,food_dining,38.61
4,4720000000000000.0,M,Laoag,"111,125 people",Dance movement psychotherapist,11/01/1954,257000000000,257000000000,c20ee88b451f637bc6893b7460e9fee0,1601282159,gas_transport,82.69


# 8. Remove "people" and "," in `city_pop`and convert to integer
*Hint: convert to desired data type using method `.astype()`*

In [14]:
df['city_pop'] = df['city_pop'].str.replace(' people','').str.replace(',','').astype(int)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['city_pop'] = df['city_pop'].str.replace(' people','').str.replace(',','').astype(int)


Unnamed: 0,cc_num,gender,city,city_pop,job,dob,acct_num,acct_num2,trans_num,unix_time,category,amt
0,676000000000.0,M,Dasmarinas,659019,Chartered loss adjuster,12/12/1958,798000000000,798000000000,a72eaa86b043eed95b25bbb25b3153a1,1581314011,shopping_net,68.88
1,3520000000000000.0,M,Digos,169393,"Administrator, charities/voluntary organisations",31/08/1970,968000000000,968000000000,060d12f91c13871a13963041736a4702,1590902968,entertainment,50.06
2,4.14e+18,M,Calapan,133893,Financial controller,23/07/1953,628000000000,628000000000,18aafb6098ab0923886c0ac83592ef8d,1585461157,food_dining,105.44
3,4870000000000000.0,M,San Fernando,121812,,18/07/1964,863000000000,863000000000,45bbe714e51ab8a375454d39a190b0cb,1613063704,food_dining,38.61
4,4720000000000000.0,M,Laoag,111125,Dance movement psychotherapist,11/01/1954,257000000000,257000000000,c20ee88b451f637bc6893b7460e9fee0,1601282159,gas_transport,82.69


# **Optional, For intermediate/advanced learners**:
## Column `unix_time` is a unique system for machines to internally tracking time, in UTC. Can you convert it to a human-readable date and local time in "YYYY-MM-DD HH:MM:SS" format and store in column named 'trans_datetime`?




In [15]:
# Convert UNIX timestamps to datetime, assume UTC
df['trans_datetime'] = pd.to_datetime(df['unix_time'], unit='s', utc=True)

# Convert to PHT time
df['trans_datetime'] = df['trans_datetime'].dt.tz_convert('Asia/Shanghai')

# Format datetime to 'yyyy-mm-dd hh:mm:ss'
df['trans_datetime'] = df['trans_datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trans_datetime'] = pd.to_datetime(df['unix_time'], unit='s', utc=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trans_datetime'] = df['trans_datetime'].dt.tz_convert('Asia/Shanghai')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['trans_datetime'] = df['trans_datetime'].dt.strftime

Unnamed: 0,cc_num,gender,city,city_pop,job,dob,acct_num,acct_num2,trans_num,unix_time,category,amt,trans_datetime
0,676000000000.0,M,Dasmarinas,659019,Chartered loss adjuster,12/12/1958,798000000000,798000000000,a72eaa86b043eed95b25bbb25b3153a1,1581314011,shopping_net,68.88,2020-02-10 13:53:31
1,3520000000000000.0,M,Digos,169393,"Administrator, charities/voluntary organisations",31/08/1970,968000000000,968000000000,060d12f91c13871a13963041736a4702,1590902968,entertainment,50.06,2020-05-31 13:29:28
2,4.14e+18,M,Calapan,133893,Financial controller,23/07/1953,628000000000,628000000000,18aafb6098ab0923886c0ac83592ef8d,1585461157,food_dining,105.44,2020-03-29 13:52:37
3,4870000000000000.0,M,San Fernando,121812,,18/07/1964,863000000000,863000000000,45bbe714e51ab8a375454d39a190b0cb,1613063704,food_dining,38.61,2021-02-12 01:15:04
4,4720000000000000.0,M,Laoag,111125,Dance movement psychotherapist,11/01/1954,257000000000,257000000000,c20ee88b451f637bc6893b7460e9fee0,1601282159,gas_transport,82.69,2020-09-28 16:35:59


# 9. Save cleaned dataframe as "cc_clean.csv"

In [16]:
df.to_csv('Data/cc_clean.csv', index=False)