In [1]:
import pandas as pd, numpy as np
import xlsxwriter, openpyxl
from datetime import datetime as dt

# Data Reading

In [2]:
def csvORexcel(file_Name, base_dir="C:\\Python\\read\\"):
    """Load a CSV or Excel file from ``base_dir`` into a DataFrame.

    The reader is picked from the text after the last dot of ``file_Name``,
    compared case-insensitively: an extension starting with ``c`` is read with
    ``pd.read_csv`` and one starting with ``x`` with ``pd.read_excel``.

    Parameters
    ----------
    file_Name : str
        File name including its extension.
    base_dir : str, optional
        Folder to read from. The default is the folder this notebook has
        always used, so existing one-argument calls are unaffected.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data. ``None`` is returned, after printing a message, when
        the file does not exist or the extension is not recognised.

    Side effects
    ------------
    The full path that was tried is stored in the module-level global ``path``.
    """
    import os

    global path
    if base_dir.endswith(("\\", "/")):
        # Plain concatenation keeps the default path byte-identical to before.
        path = base_dir + file_Name
    else:
        path = os.path.join(base_dir, file_Name)

    # Lower-casing makes "DATA.CSV" / "Report.XLSX" load instead of silently
    # falling through and returning None.
    extension = file_Name.split('.')[-1].lower()
    try:
        if extension.startswith('c'):
            return pd.read_csv(path)
        if extension.startswith('x'):
            return pd.read_excel(path)
    except FileNotFoundError:
        print("The file name {0} has not found".format(path))
        return None

    print("The file name {0} has an unsupported extension".format(path))
    return None

In [3]:
# Lookup from a two-letter state code to the full state name.
# Two states are reachable through two codes each: 'BR'/'BH' both give Bihar
# and 'CT'/'CG' both give Chhattisgarh.
states = {
    'AP': 'Andhra Pradesh',
    'AR': 'Arunachal Pradesh',
    'AS': 'Assam',
    'BR': 'Bihar',
    'BH': 'Bihar',
    'CT': 'Chhattisgarh',
    'CG': 'Chhattisgarh',
    'DL': 'Delhi',
    'GA': 'Goa',
    'GJ': 'Gujarat',
    'HR': 'Haryana',
    'HP': 'Himachal Pradesh',
    'JH': 'Jharkhand',
    'KA': 'Karnataka',
    'KL': 'Kerala',
    'MP': 'Madhya Pradesh',
    'MH': 'Maharashtra',
    'MN': 'Manipur',
    'ML': 'Meghalaya',
    'MZ': 'Mizoram',
    'NL': 'Nagaland',
    'OR': 'Odisha',
    'PB': 'Punjab',
    'RJ': 'Rajasthan',
    'SK': 'Sikkim',
    'TN': 'Tamil Nadu',
    'TG': 'Telangana',
    'TR': 'Tripura',
    'UP': 'Uttar Pradesh',
    'UT': 'Uttarakhand',
    'WB': 'West Bengal',
}

# Data Cleaning

In [4]:
def cleaner(data0):
    """Fill gaps, normalise the mobile column and split off repeated citizens.

    Steps, in order:

    1. Fill missing values: the scheme GUID column (``'Scheme/Doc GUID'`` or,
       when that is absent, ``'Scheme Guid'``) and ``'Citizen Name'`` with
       ``'a'``, ``'HD Name'`` with ``'blank'``, ``'Citizen Mobile'`` with ``0``.
    2. Turn ``'Citizen Mobile'`` into stripped strings.
    3. When there is no ``'Scheme Name'`` column, rename the first column whose
       name contains ``'e/'`` to ``'Scheme Name'``.
    4. Collect every row whose mobile number occurs more than once, and keep
       only the last occurrence of each number in the cleaned data.

    The input frame is not modified; all work happens on a copy. Status,
    gender, date and state-name normalisation are not performed here.

    Parameters
    ----------
    data0 : pandas.DataFrame
        Raw case data. Must contain ``'Citizen Name'``, ``'HD Name'`` and
        ``'Citizen Mobile'``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(cleaned, duplicates)``, both with a fresh 0..n-1 index.
        ``duplicates`` holds all rows that share a mobile number, including
        the one that is kept in ``cleaned``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame is left untouched.
    data0 = data0.copy()

    # Replace null values. Assigning the result back (rather than calling
    # fillna(inplace=True) on a column selection) is required for the fill to
    # take effect under pandas copy-on-write.
    if 'Scheme/Doc GUID' in data0.columns:
        data0['Scheme/Doc GUID'] = data0['Scheme/Doc GUID'].fillna('a')
    elif 'Scheme Guid' in data0.columns:
        data0['Scheme Guid'] = data0['Scheme Guid'].fillna('a')
    data0['Citizen Name'] = data0['Citizen Name'].fillna('a')
    data0['HD Name'] = data0['HD Name'].fillna('blank')
    data0['Citizen Mobile'] = data0['Citizen Mobile'].fillna(0)

    # Convert the mobile column to strings so it can be compared as text.
    # NOTE(review): when the column was read as float, values come out with a
    # trailing ".0" (e.g. "9876543210.0") - confirm that is acceptable.
    data0['Citizen Mobile'] = (
        data0['Citizen Mobile'].apply(lambda value: str(value).strip()).astype('str')
    )

    # Make sure a 'Scheme Name' column exists.
    # NOTE(review): the 'e/' match also hits 'Scheme/Doc GUID'; whichever
    # matching column comes first is the one renamed - confirm column order.
    if 'Scheme Name' not in data0.columns:
        for col in data0.columns:
            if isinstance(col, str) and 'e/' in col:
                data0 = data0.rename(columns={col: 'Scheme Name'})
                break

    # Check duplicate records based on the citizen mobile number.
    # NOTE(review): rows whose mobile was missing all share the filled value,
    # so they count as duplicates of each other and only the last one is kept.
    repeated_any = data0.duplicated(['Citizen Mobile'], keep=False)
    repeated_not_last = data0.duplicated(['Citizen Mobile'], keep='last')

    duplicates = data0[repeated_any].reset_index(drop=True)
    data0 = data0[~repeated_not_last].reset_index(drop=True)
    return (data0, duplicates)

# Sampling data

In [5]:
def sampling(data0, percentage=None):
    """Draw a per-holder, per-scheme sample and return it with the remainder.

    Rows are grouped by a holder column and a scheme column. The first pair of
    columns that are both present is used, tried in this order:
    ``('HD ID', 'Scheme/Doc GUID')``, ``('HD Name', 'Scheme Guid')``,
    ``('HD Name', 'Scheme Name')``. Groups are visited in ``value_counts``
    order (largest first), and rows with a missing holder or scheme value are
    never sampled. For each group:

    * 1 row   -> the row is always taken;
    * 2 rows  -> exactly one row is taken;
    * 3+ rows -> the requested fraction is taken (pandas rounds the count).

    Every draw uses ``random_state=1`` without replacement, so results are
    repeatable.

    Parameters
    ----------
    data0 : pandas.DataFrame
        Cleaned case data; must contain ``'Case ID'`` and one of the column
        pairs listed above.
    percentage : int or str, optional
        Whole-number sampling percentage between 0 and 100. When omitted the
        user is prompted for it, as before.

    Returns
    -------
    tuple
        ``(samp, data0, remain, fr)`` where ``samp`` is the sample with a fresh
        index, ``data0`` is the input with a fresh index, ``remain`` is the
        rows of ``data0`` whose ``'Case ID'`` is not in the sample, and ``fr``
        is the sampling fraction (percentage / 100).

    Raises
    ------
    ValueError
        If the percentage is not a whole number within 0..100.
    KeyError
        If ``'Case ID'`` or all of the grouping column pairs are missing.
    """
    if percentage is None:
        percentage = input('Please provide sampling percentage')
    fr = round(int(percentage)/100, 2)
    if not 0 <= fr <= 1:
        raise ValueError('Sampling percentage must be between 0 and 100, got {0}'.format(percentage))

    if 'Case ID' not in data0.columns:
        raise KeyError('Case ID')

    # Choose the grouping columns by presence instead of relying on a chain
    # of exceptions, so unrelated errors are no longer hidden.
    key_options = (
        ('HD ID', 'Scheme/Doc GUID'),
        ('HD Name', 'Scheme Guid'),
        ('HD Name', 'Scheme Name'),
    )
    for holder_col, scheme_col in key_options:
        if holder_col in data0.columns and scheme_col in data0.columns:
            break
    else:
        raise KeyError('No holder/scheme column pair found; expected one of {0}'.format(key_options))

    pieces = []
    for holder in data0[holder_col].value_counts().index:
        holder_rows = data0[data0[holder_col] == holder]
        for scheme in holder_rows[scheme_col].value_counts().index:
            group = holder_rows[holder_rows[scheme_col] == scheme]
            if len(group) == 1:
                piece = group
            elif len(group) == 2:
                piece = group.sample(n=1, random_state=1, replace=False)
            else:
                piece = group.sample(frac=fr, random_state=1, replace=False)
            # A small fraction of a small group can round down to zero rows.
            if len(piece) > 0:
                pieces.append(piece)

    # Concatenate once; fall back to an empty frame with the same columns.
    samp = pd.concat(pieces) if pieces else data0.iloc[0:0]
    samp = samp.reset_index(drop=True)

    data0 = data0.reset_index(drop=True)
    # Split by Case ID membership. Comparing text labels such as "50.0%"
    # cannot tell sampled from remaining rows when the percentage is 50.
    remain = data0[~data0['Case ID'].isin(samp['Case ID'])]

    return (samp, data0, remain, fr)

# Data Exporting

In [6]:
def export_to_excel(samp, data0, remain, duplicates, fn, fr, export_dir='C:\\Python\\export\\'):
    """Write the sampling results to one Excel workbook.

    The workbook is named ``sample_data_<stem>.xlsx`` where ``<stem>`` is
    ``fn`` without its final extension. It contains these sheets:

    * ``'<p>% sampling_1'``     - the sampled rows (``samp``)
    * ``'<100-p>% sampling_2'`` - the rows not sampled (``remain``)
    * ``'unique data'``         - the full de-duplicated data (``data0``)
    * ``'duplicates'``          - only written when ``duplicates`` has rows

    Parameters
    ----------
    samp, data0, remain, duplicates : pandas.DataFrame
        Frames to export, one per sheet as listed above.
    fn : str
        Name of the source file; used to build the workbook name.
    fr : float
        Sampling fraction (e.g. ``0.8``); used to label the first two sheets.
    export_dir : str, optional
        Folder to write to. The default is the folder this notebook has
        always used, so existing calls are unaffected.
    """
    import os

    # Strip only the last extension so names containing other dots
    # (e.g. "ABC Pvt. Ltd_12.xlsx") are not cut short.
    stem = fn.rsplit('.', 1)[0]

    # Round so float artefacts such as 28.999999999999996 do not end up in
    # the sheet names; whole percentages still print as e.g. "80.0".
    sampled_pct = round(fr*100, 2)
    remaining_pct = round(100.0 - sampled_pct, 2)

    workbook_name = 'sample_data_' + stem + '.xlsx'
    if export_dir.endswith(('\\', '/')):
        # Plain concatenation keeps the default path byte-identical to before.
        target = export_dir + workbook_name
    else:
        target = os.path.join(export_dir, workbook_name)

    # Exporting data of unique records to Excel file.
    with pd.ExcelWriter(target) as writer:
        samp.to_excel(writer, sheet_name=str(sampled_pct)+'% sampling_1', index=False)
        remain.to_excel(writer, sheet_name=str(remaining_pct)+'% sampling_2', index=False)
        data0.to_excel(writer, sheet_name='unique data', index=False)
        if duplicates.shape[0] > 0:
            duplicates.to_excel(writer, sheet_name='duplicates', index=False)
    print('Data of project {0} exported to excel.'.format(stem))

In [7]:
# Driver cell: read the file, clean it, draw the sample and export the result.
file_Name = input('Please provide file name = ')
data0 = csvORexcel(file_Name)
fn = file_Name
if data0 is None:
    # csvORexcel gives back None when the file is missing or its extension is
    # not handled; stop here instead of failing inside cleaner().
    print('No data loaded from {0}; nothing to sample.'.format(fn))
else:
    print('Getting sample data of ',fn)
    data0, duplicates = cleaner(data0)
    samp, data0, remain, fr = sampling(data0)
    export_to_excel(samp, data0, remain, duplicates, fn, fr)

Please provide file name =  Valid Proof_AP - Colgate Palmolive India Limited (Phase 2)_703.xlsx


Getting sample data of  Valid Proof_AP - Colgate Palmolive India Limited (Phase 2)_703.xlsx


Please provide sampling percentage 80


Data of project Valid Proof_AP - Colgate Palmolive India Limited (Phase 2)_703 exported to excel.
