# Load data from your local drive

1. Start by writing the following code, then run the code cell:

In [None]:
from google.colab import files
# Keep the upload result: a later cell looks up the uploaded file's bytes in it
# by file name (data_to_load['APPEND_ALL_DATASET.xlsx']).
data_to_load = files.upload()

2. You will be asked to select a file. Click on “Choose files”, then select the CSV file of your choice to upload it. You should see the name of the file displayed once Colab has finished uploading it.

3. Finally, write the following code to import your file into a Pandas DataFrame (make sure the file name matches the name of the uploaded file).

## Read/Open Data

Import Libraries and Data

In [None]:
import pandas as pd
from google.colab import data_table
# Switch on Colab's data_table formatter for DataFrame output
data_table.enable_dataframe_formatter()

### Excel

In [None]:
import io
# Wrap the uploaded bytes in an in-memory buffer so read_excel can treat them as a file
excel_bytes = data_to_load['APPEND_ALL_DATASET.xlsx']
df = pd.read_excel(io.BytesIO(excel_bytes))

In [None]:
# Re-wrap the loaded data as a DataFrame, then preview the first rows
df = pd.DataFrame(df)
df.head()

### CSV

In [None]:
# Read the sample CSV from the Colab file system
sample_csv_path = '/content/sample_data/california_housing_test.csv'
data = pd.read_csv(sample_csv_path)

In [None]:
# Display the frame loaded from the CSV
data

## Alternative: Open a Large Dataset

### Chunked Reading with Pandas

In [None]:
#load profil
# Read the file in 1,000,000-row chunks and join them with ONE concat call.
# Calling pd.concat inside the loop re-copies the ever-growing frame for every
# chunk; collecting the chunks first keeps the work linear. The `with` block
# closes the underlying file once all chunks have been read.
with pd.read_csv("identifier_profil.csv", chunksize=1000000, low_memory=False) as reader:
    profil_chunks = list(reader)
# Fall back to an empty frame if the reader yielded no chunks at all
allprofil = pd.concat(profil_chunks) if profil_chunks else pd.DataFrame()


### Modin

In [None]:
import ray
# Start Ray with 4 CPUs before modin.pandas is imported
ray.init(num_cpus=4)
import modin.pandas as mpd

# Shell-style setting kept for reference (commented out, so it has no effect here)
#export MODIN_OUT_OF_CORE=true

In [None]:
# Read a pipe-delimited file through Modin's pandas-style API
modin_data = mpd.read_csv('PDPS_PROFIL_21SEPT.csv', sep='|')

## Merge Multiple Files to ONE file

In [None]:
!pip install -q xlrd
import pandas as pd
df1 = pd.read_excel('/content/DATASET01.xlsx')
df2 = pd.read_excel('/content/DATASET02.xlsx')
df3 = pd.read_excel('/content/DATASET03.xlsx')


In [None]:
import os
import glob
import pandas as pd
os.chdir("/content")

extension = 'xlsx'
# Sort the matches so the row order of `combined` is reproducible (glob order
# depends on the file system), and skip Excel's "~$" lock files, which match
# the pattern but are not real workbooks.
all_filenames = sorted(
    name for name in glob.glob('*.{}'.format(extension))
    if not name.startswith('~$')
)

#combine all files in the list
combined = pd.concat([pd.read_excel(f) for f in all_filenames])
#export to csv
#combined.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')



## Check Data Properties
Total number of rows and data properties

In [None]:
# Total row count across the three loaded frames
sum(len(frame) for frame in (df1, df2, df3))

In [None]:
# Column summary of df; swap in the commented line to inspect `combined` instead
#combined.info()
df.info()

In [None]:
# (rows, columns) of df
df.shape

In [None]:
# Descriptive statistics for df
df.describe()

## Change Datatype

In [None]:
# Convert each of these columns to text, value by value
for text_col in ('no_passport', 'no_daftar_majikan', 'id_jenis_pekerjaan'):
    df[text_col] = df[text_col].apply(str)

In [None]:
# Cast both columns of `bant` to float
for float_col in ('nilai_bantuan_diterima', 'kadar_bantuan_diterima'):
    bant[float_col] = bant[float_col].astype('float')

## Rename Columns

In [None]:
# Rename the first two headers by position instead of by their current names
first_col, second_col = df.columns[0], df.columns[1]
mapping = {first_col: "no_kp2", second_col: "no_kp_lama2"}

# rename() returns a new frame; df itself keeps its headers
batch1 = df.rename(columns=mapping)
batch1.head()

In [None]:
# Rename both columns in one pass
df = df.rename(columns={'nama': 'nama2', 'no_tel': 'no_tel2'})

In [None]:
# Preview the first rows to check the new headers
df.head()

# Data Wrangling

## Fix leading zero

In [None]:
# Left-pad every no_kp with zeros to 12 characters.
# na_action='ignore' keeps missing values missing; converting them with str()
# first would turn them into the text 'nan' and pad that to '000000000nan'.
# The former third step (apply("{}".format)) re-formatted strings into the same
# strings and is dropped.
df['no_kp'] = df['no_kp'].map(lambda value: str(value).rjust(12, '0'), na_action='ignore')

In [None]:
# Preview the first no_kp values
df['no_kp'].head() 

In [None]:
# Show the rows whose no_kp contains '000' anywhere in the value
has_triple_zero = df['no_kp'].str.contains('000')
df[has_triple_zero]
#zero = df[df['no_kp'].str.contains('000')]
#zero.to_csv('zerotest3.csv', sep ='|', index=False)

## Check Duplication

In [None]:
#check duplication
# Only the last expression of a cell is displayed, so print the count explicitly
duplicate_count = df.duplicated(subset=['no_kp']).sum()
print('rows repeating an earlier no_kp:', duplicate_count)

# Every row that shares its no_kp with another row (keep=False marks all of them)
duplicate_rows = df.loc[df.duplicated(subset=['no_kp'], keep=False), :]

# drop_duplicates returns a new frame; without assigning it back nothing is
# removed from df. Rows must match in every column to count as duplicates here.
df = df.drop_duplicates(keep='last')

duplicate_rows

In [None]:
# Count rows whose no_kp repeats an earlier row
df.duplicated(subset=['no_kp']).sum()

## Convert All Strings to Uppercase

In [None]:
# Upper-case every string cell and leave all other values untouched.
# Column-wise Series.map is used because DataFrame.applymap is deprecated in
# recent pandas; isinstance replaces the exact type comparison.
df = df.apply(lambda column: column.map(lambda s: s.upper() if isinstance(s, str) else s))

## Remove Unwanted (Non-ASCII) Characters from alamat_1

In [None]:
# Convert alamat_1 to text, then delete every character outside the ASCII range
alamat_text = df['alamat_1'].astype(str)
df['alamat_1'] = alamat_text.str.replace('[^\x00-\x7F]', '', regex=True)

## Convert tarikh_mula and tarikh_tamat to Datetime

In [None]:
import datetime

In [None]:
# Cast both date columns to pandas' nanosecond datetime dtype
for date_col in ('tarikh_mula', 'tarikh_tamat'):
    df[date_col] = df[date_col].astype('datetime64[ns]')

## Replace NaT/NULL with None

In [None]:
import numpy as np
# Put the text 'None' where a date is missing.
# The result is assigned back to the column: replace(..., inplace=True) on a
# selected column works on a temporary object and is not guaranteed to change
# df itself.
for date_col in ('tarikh_mula', 'tarikh_tamat'):
    df[date_col] = df[date_col].replace(np.nan, 'None')

## Convert NaN to None (MySQL understands None, and the UI then shows NULL)

In [None]:
# Replace every missing value with None.
# The cast to object comes first: on numeric columns where(..., None) turns the
# None straight back into NaN, so the values would never become None.
df = df.astype(object).where(pd.notnull(df), None)


## Save Floats with 2 Decimals

In [None]:
# Pipe-delimited export without the index; floats use two decimals and
# datetimes are written as 'YYYY-MM-DD HH:MM:SS'
export_options = dict(
    sep='|',
    index=False,
    float_format='%.2f',
    date_format='%Y-%m-%d %H:%M:%S',
)
df.to_csv('programtest.csv', **export_options)

In [None]:
# Work on an independent copy so shifting the index leaves df untouched.
# (The copy has to be created here; assigning a whole DataFrame to a single
# column named '' fails for a multi-column frame.)
df_copy = df.copy()
# Shift every index label up by one
df_copy.index += 1

In [None]:
# Inspect the index of df_copy
df_copy.index

In [None]:
# Preview the first rows of df_copy
df_copy.head()

## Summary (SUM/COUNT) Report of NA Values by Agensi and by Variable

In [None]:
# Count the missing values in every column, per id_agensi.
# sum(level=0) no longer exists in current pandas; grouping on the index level
# is the replacement. sort=False keeps the groups in order of first appearance.
df.set_index('id_agensi').isna().groupby(level=0, sort=False).sum()

In [None]:
# Missing-value counts per column for each id_agensi, saved to CSV.
# sum(level=0) no longer exists in current pandas; grouping on the index level
# is the replacement. sort=False keeps the groups in order of first appearance.
b3 = df.set_index('id_agensi').isna().groupby(level=0, sort=False).sum()
b3.to_csv('byagensi.csv')


## Validation Data

In [None]:
# Install sidetable, then import it (presumably this is what provides the
# df.stb accessor used in the cells below)
!pip install sidetable
import sidetable as stb

In [None]:
# Frequency table for tahun_lahir
df.stb.freq(['tahun_lahir'])

In [None]:
# Frequency table for id_agensi, with sidetable's styled output
df.stb.freq(['id_agensi'], style=True)

In [None]:
# Reset df's index in place (modifies df directly; nothing is returned)
df.reset_index(inplace=True)

In [None]:
#exact search
# Compare ids as text; \b...\b only matches 18 as a whole token, not inside 180 or 218
df['id'] = df['id'].apply(str)
is_whole_18 = df['id'].str.contains(r'\b18\b')
df[is_whole_18]

In [None]:
# Rows whose nama contains an apostrophe.
# na=False counts a missing nama as "no match"; without it the mask contains
# NaN and the boolean indexing fails. regex=False matches the character literally.
df[df['nama'].str.contains("'", na=False, regex=False)]

# Recode Values

## Parlimen ID

In [None]:
# Frequency table for id_parlimen before recoding
df.stb.freq(['id_parlimen'])

In [None]:
# Recode the text codes "P001" ... "P222" to the integers 1 ... 222.
# The lookup is generated rather than typed out: a 'P' followed by the
# zero-padded three-digit number maps to that number.
parlimen_codes = {'P{:03d}'.format(number): number for number in range(1, 223)}
df['id_parlimen'] = df['id_parlimen'].replace(parlimen_codes)

In [None]:
# Inspect id_parlimen after recoding
df['id_parlimen']

In [None]:
# Frequency table for id_strata; uncomment the line below to check its dtype
df.stb.freq(['id_strata'], style=True)
#df['id_strata'].dtypes

In [None]:
# Frequency table for id_agama
df.stb.freq(['id_agama'], style=True)

In [None]:
# Frequency table for id_status_kahwin
df.stb.freq(['id_status_kahwin'], style=True)

In [None]:
# Frequency table for id_taraf_pendidikan
df.stb.freq(['id_taraf_pendidikan'], style=True)

# Split Table [Profil | Bantuan]

## Rename Column Headers

In [None]:
# Target header names in positional order: entry i becomes the new name of
# df.columns[i]. df must have at least as many columns as there are names.
new_names = [
    "no_kp",
    "no_kp_lama",
    "no_sijil_kelahiran",
    "no_passport",
    "nama",
    "no_tel",
    "tahun_lahir",
    "id_jantina",
    "id_agama",
    "etnik",
    "id_kluster_etnik",
    "id_warganegara",
    "emel",
    "id_status_kahwin",
    "alamat_1",
    "alamat_2",
    "alamat_3",
    "poskod",
    "id_negeri",
    "id_daerah",
    "id_mukim",
    "id_parlimen",
    "id_dun",
    "id_strata",
    "id_taraf_pendidikan",
    "id_sijil",
    "id_jenis_pemilikan_kediaman",
    "id_jenis_tempat_kediaman",
    "id_jenis_pemilikan_kenderaan",
    "id_jenis_kenderaan",
    "no_plat",
    "id_status_pekerjaan",
    "id_jenis_pekerjaan",
    "nama_majikan",
    "no_daftar_majikan",
    "id_jenis_kemahiran",
    "id_penyakit",
    "no_daftar_oku",
    "id_jenis_oku",
    "id_rawatan",
    "no_akaun_bank",
    "id_bank",
    "id_jenis_akaun_bank",
    "nilai_sumber_pendapatan",
    "id_sumber_pendapatan",
    "jumlah_pendapatan_bulanan",
    "nilai_bantuan_diterima",
    "kadar_bantuan_diterima",
    "id_status_bantuan",
    "tarikh_lulus",
    "tarikh_mula",
    "tarikh_tamat",
    "id_kategori_bantuan",
    "bidang_fokus",
    "id_jenis_sub_kategori",
    "id_teras",
    "id_kaedah_pemberian",
    "id_kekerapan",
    "id_sektor",
    "id_status_pelaksanaan",
    "id_kumpulan_sasar",
    "id_program",
    "id_agensi",
    "id_kementerian",
    "skop",
    "papar_umum",
    "sebab_tidak_aktif",
    "syarat_program",
    "status_pelaksanaan",
    "ptarikh_mula",
    "ptarikh_tamat",
    "jumlah_peruntukan",
    "url",
]
# Pair each current header with its replacement by position
mapping = {df.columns[position]: name for position, name in enumerate(new_names)}
# rename() returns a new frame; df keeps its original headers
df2 = df.rename(columns=mapping)

In [None]:
# List the headers of the renamed frame
df2.columns

In [None]:
# Preview the first rows of the renamed frame
df2.head()

## Split Profil

## Split Bantuan

In [None]:
# Columns carried over into the bantuan table; the two commented-out ids are
# deliberately not selected.
bantuan_columns = [
    'id',
    #'id_profil',
    #'id_program',
    'id_status_bantuan',
    'nilai_bantuan_diterima',
    'kadar_bantuan_diterima',
    'tarikh_mula',
    'tarikh_tamat',
    'tarikh_lulus',
]
# Note that the selection is made from df, not from the renamed df2
bantuan = df[bantuan_columns]

In [None]:
# Preview the first rows of the bantuan table
bantuan.head()