In [6]:
import pandas as pd
from glob import glob
import os
import re
import numpy as np

In [7]:
selected_data = []
base_dir = "/Users/dhingratul/Documents/parsed"
write_dir = "/Users/dhingratul/Documents/instate_data"
chunk_size = 1000000
unsupported_states = ["himachal", "wb", "tn"]

In [8]:
def select_csv_folder(base_dir):
    all_data = []
    for fn in sorted(glob(os.path.join(base_dir, "*.csv"))):
        state_split = re.split("[- _ : + .]", os.path.basename(fn))
        state = state_split[0]
        print(f"Processing state: {state}")
        if "guj" in state_split[1:]:
            print(f"Skipping non-supported state: {state}")
        else:
            print(f"Processing, csv folder,  state {state}")
            columns = pd.read_csv(fn, index_col=0, nrows=0).columns.tolist()
            if "name" in columns:
                df = pd.read_csv(
                    fn,
                    usecols=["name", "state", "father_or_husband_name", "sex"],
                )
                df = df.rename(columns={"name": "elector_name"})
            else:
                df = pd.read_csv(
                    fn,
                    usecols=["elector_name", "state", "father_or_husband_name", "sex"],
                )
            df["state"] = state
            all_data.append(df)
    return pd.concat(all_data)

In [9]:
def _establish_last_name(name, father_name):
    if name is np.nan:
        name = "FNU"
    if father_name is np.nan:
        father_name = "FNU"
    if len(name.split()) > 1:
        last_name = name.split()[-1]
    else:
        if len(father_name.split()) > 1:
            last_name = father_name.split()[-1]
        else:
            last_name = "LNU"
    return last_name


def establish_last_name(df):
    print("Cleaning Data, processing last names")
    df["last_name"] = df.apply(
        lambda x: _establish_last_name(x["elector_name"], x["father_or_husband_name"]),
        axis=1,
    )
    df = df.drop(["father_or_husband_name", "elector_name"], axis=1)
    return df

In [17]:
# # "*csv"
df_csv = select_csv_folder(base_dir)
df_csv = establish_last_name(df_csv)
selected_data.append(df_csv)

Cleaning Data, processing last names


In [18]:
df_csv.head()

Unnamed: 0,sex,state,last_name
0,Male,andaman,DATTA
1,Female,andaman,DEVI
2,Male,andaman,KRISHNA
3,Male,andaman,SEKHAR
4,Male,andaman,LNU


In [19]:
# *.7z, pre-req: Extract .7z file to a folder manually using system unzipper
for f in [
    name
    for name in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, name))
]:
    folder_split = re.split("[- _ : +]", os.path.basename(f))
    state = folder_split[0]
    if "guj" in folder_split[1:]:
        print(f"Skipping unsupported state: {state}")
    else:
        print(f"Processing, 7z,  {state}")
        df_7z = select_csv_folder(os.path.join(base_dir, f))
        df_7z = establish_last_name(df_7z)
        selected_data.append(df_7z)

Processing, 7z,  andhra
Processing state: andhra
Processing, csv folder,  state andhra
Processing state: andhra
Processing, csv folder,  state andhra
Processing state: andhra
Processing, csv folder,  state andhra
Processing state: andhra
Processing, csv folder,  state andhra
Processing state: andhra
Processing, csv folder,  state andhra
Processing state: andhra
Processing, csv folder,  state andhra
Processing state: andhra
Processing, csv folder,  state andhra
Cleaning Data, processing last names
Skipping unsupported state: dadra


In [20]:
def select_gz_chunk(df, state_split):
    all_data = []
    if "clean" in state_split:
        df = df[["elector_name_t13n", "state", "father_or_husband_name_t13n", "sex"]]
        df = df.rename(
            columns={
                "elector_name_t13n": "elector_name",
                "father_or_husband_name_t13n": "father_or_husband_name",
            }
        )
    else:
        df = df[["elector_name", "state", "father_or_husband_name", "sex"]]
    df["state"] = state_split[0]
    all_data.append(df)
    return pd.concat(all_data)

In [21]:
# *.gz.csv, pre-req: use scripts/concatenate.py to merge .partaa, .partab, etc files
for base_path in sorted(glob(os.path.join(base_dir, "*.csv.gz"))):
    state_split = re.split("[- _ : +]", os.path.basename(base_path))
    state = state_split[0]
    if state in unsupported_states:
        print(f"Skipping unsupported state: {state}")
    else:
        df_test = pd.read_csv(base_path, index_col=0, nrows=1)
        columns = df_test.columns.tolist()
        for df in pd.read_csv(base_path, chunksize=chunk_size):
            print(f"Processing, gz folder,  state: {state}, chunk {chunk_size}")
            df_gz = select_gz_chunk(df, state_split)
            df_gz = establish_last_name(df_gz)
            selected_data.append(df_gz)
final_df = pd.concat(selected_data)
final_df = final_df[final_df.last_name.str.isalpha()]
final_df["last_name"] = final_df["last_name"].str.lower()

  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: assam, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: assam, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: assam, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: assam, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: assam, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: assam, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: assam, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: assam, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: assam, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: assam, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: bihar, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: chandigarh, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: delhi, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: guj, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: har, chunk 1000000
Cleaning Data, processing last names
Skipping unsupported state: himachal


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: jha, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kar, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Cleaning Data, processing last names
Processing, gz folder,  state: kerala, chunk 1000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Cleaning Data, processing last names
Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: kerala, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: maharashtra, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: mp, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 10

  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1

  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: odi, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: punjab, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: rajasthan, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: sikkim, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]
  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Processing, gz folder,  state: tel, chunk 1000000
Cleaning Data, processing last names


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["state"] = state_split[0]


Skipping unsupported state: tn


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tripura, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tripura, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: tripura, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: up, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: utt, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: utt, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: utt, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: utt, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: utt, chunk 1000000
Cleaning Data, processing last names


  for df in pd.read_csv(base_path, chunksize=chunk_size):


Processing, gz folder,  state: utt, chunk 1000000
Cleaning Data, processing last names
Processing, gz folder,  state: utt, chunk 1000000
Cleaning Data, processing last names
Skipping unsupported state: wb


In [26]:
MIN_OCCURENCE = 3

def process_data(df):
    df = df[df.last_name != "lnu"]  # Remove last name unknows
    df = df[
        df.groupby("last_name")["last_name"].transform("count").ge(MIN_OCCURENCE)
    ]  # Remove all last names that occur less than MIN_OCCURANCE times
    df = df[(df.state != "wb")]
    df = df[(df.state != "tn")]
    df = df[df.last_name.str.len() > 2]
    df = df[df['sex'].notna()]
    return df

In [27]:
final_df.head()

Unnamed: 0,sex,state,last_name
0,Male,andaman,datta
1,Female,andaman,devi
2,Male,andaman,krishna
3,Male,andaman,sekhar
4,Male,andaman,lnu


In [28]:
processed_df = process_data(final_df)

In [29]:
processed_df.head()

Unnamed: 0,sex,state,last_name
0,Male,andaman,datta
1,Female,andaman,devi
2,Male,andaman,krishna
3,Male,andaman,sekhar
7,Male,andaman,toppo


In [45]:
def export_csv_gz(df, write_dir):
    path_to_write = os.path.join(write_dir, f"instate_processed_clean.csv.gz")
    print(f"Writing dataframe to path: {path_to_write}")
    df.to_csv(path_to_write, compression="gzip", index=False)

In [31]:
export_csv_gz(processed_df, write_dir)


Writing dataframe to path: /Users/dhingratul/Documents/instate_data/instate_processed.csv.gz


In [10]:
processed_df = pd.read_csv(os.path.join(write_dir, f"instate_processed.csv.gz"))

In [11]:
processed_df.shape

(427763118, 3)

In [12]:
processed_df_clean =  processed_df[processed_df["last_name"].str.contains('[a-z]',  na=False)]

In [13]:
processed_df_clean.shape

(427699190, 3)

In [14]:
processed_df_clean["last_name"]

0                 datta
1                  devi
2               krishna
3                sekhar
4                 toppo
                ...    
427763113      chouhaan
427763114      chouhaan
427763115        kumaar
427763116        raanee
427763117    prathibhaa
Name: last_name, Length: 427699190, dtype: object

In [15]:
res = pd.read_csv("/Users/dhingratul/Documents/new/instate/ms/out/dnn_pred_final.csv")

In [16]:
len(processed_df_clean["last_name"].unique())

1141479

In [17]:
processed_df_clean.shape

(427699190, 3)

In [18]:
res.head()

Unnamed: 0.1,Unnamed: 0,last_name,total_freq_n,total_freq,gt_state,lstm_pred,rnn_pred,gru_pred
0,0,muphees,1.421982e-06,122.0,"['up', 'maharashtra', 'guj', 'mp']",1,1,1
1,1,raanoobaai,3.496676e-08,3.0,['maharashtra'],1,0,1
2,2,pilibhamtara,3.496676e-08,3.0,['kar'],1,1,1
3,3,shamsani,3.496676e-08,3.0,['andhra'],0,0,0
4,4,njjaadaakaa,4.662235e-08,4.0,['odi'],1,1,1


In [19]:
processed_df_clean = processed_df_clean[processed_df['sex'].notna()]

  processed_df_clean = processed_df_clean[processed_df['sex'].notna()]


In [20]:
processed_df_agg = processed_df_clean.groupby(["last_name"]).agg(list).reset_index(drop=False)

In [21]:
processed_df_agg.head(100)

Unnamed: 0,last_name,sex,state
0,aaa,"[Male, Female, Female, Female, Female, Female,...","[andhra, delhi, delhi, delhi, delhi, delhi, de..."
1,aaaa,"[Male, Male, Male, पुरूष, पुरूष, Male, Female,...","[assam, har, har, jha, jha, maharashtra, odi, ..."
2,aaaaa,"[Male, Female, Female, Male, Female, Female, F...","[up, up, up, up, up, up, up, up, up, up, up, u..."
3,aaaaaa,"[Male, Female, Female, Male, Female, Female, F...","[maharashtra, up, up, up, up, up, up, up, up, ..."
4,aaaabaaraav,"[Male, Male, Female, Male, Male]","[maharashtra, maharashtra, maharashtra, mahara..."
...,...,...,...
95,aaambaa,"[Female, Male, Female, Female, Female, Female,...","[guj, guj, guj, guj, guj, guj, guj, guj, guj, ..."
96,aaambaabaaee,"[Female, Female, Female, Female, Female, Femal...","[maharashtra, maharashtra, maharashtra, mahara..."
97,aaambaaben,"[Female, Female, Female, Female, Female, Femal...","[guj, guj, guj, guj, guj, guj, guj, guj, guj, ..."
98,aaambaabhaa,"[Male, Male, Male, Male, Male, Male, Male, Fem...","[guj, guj, guj, guj, guj, guj, guj, maharashtra]"


In [23]:
df_merged = pd.merge(res, processed_df_agg, how='inner', on = 'last_name')

In [24]:
df_merged.shape

(2995, 10)

In [28]:
res.shape

(3000, 8)

In [29]:
df_merged.head()

Unnamed: 0.1,Unnamed: 0,last_name,total_freq_n,total_freq,gt_state,lstm_pred,rnn_pred,gru_pred,sex,state
0,0,muphees,1.421982e-06,122.0,"['up', 'maharashtra', 'guj', 'mp']",1,1,1,"[Male, Male, Male, Male, Male, Male, Female, M...","[guj, guj, guj, guj, guj, guj, maharashtra, ma..."
1,1,raanoobaai,3.496676e-08,3.0,['maharashtra'],1,0,1,"[Female, Female, Female]","[maharashtra, maharashtra, maharashtra]"
2,2,pilibhamtara,3.496676e-08,3.0,['kar'],1,1,1,"[Female, Male, Female]","[kar, kar, kar]"
3,3,shamsani,3.496676e-08,3.0,['andhra'],0,0,0,"[Female, Female, Male]","[andhra, andhra, andhra]"
4,4,njjaadaakaa,4.662235e-08,4.0,['odi'],1,1,1,"[Male, Female, Female, Female]","[odi, odi, odi, odi]"


In [30]:
df_merged.iloc[0]["sex"]

['Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Female',
 'Female',
 'Female',
 'Female',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Female',
 'Female',
 'Female',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male

In [31]:
def _count_female_prop(sex_list):
    male_ctr = 0
    female_ctr = 0
    for _sex in sex_list:
        if _sex == "Male" or _sex == 'पुरूष':
            male_ctr += 1
        elif _sex == "UNREADABLE":
            pass
        else:
            female_ctr += 1
    if male_ctr == 0 and female_ctr == 0:
        return np.nan
    else:
        return female_ctr/(male_ctr+female_ctr)

In [32]:
df_merged["female_prop"] = np.nan

In [33]:
df_merged.head()

Unnamed: 0.1,Unnamed: 0,last_name,total_freq_n,total_freq,gt_state,lstm_pred,rnn_pred,gru_pred,sex,state,female_prop
0,0,muphees,1.421982e-06,122.0,"['up', 'maharashtra', 'guj', 'mp']",1,1,1,"[Male, Male, Male, Male, Male, Male, Female, M...","[guj, guj, guj, guj, guj, guj, maharashtra, ma...",
1,1,raanoobaai,3.496676e-08,3.0,['maharashtra'],1,0,1,"[Female, Female, Female]","[maharashtra, maharashtra, maharashtra]",
2,2,pilibhamtara,3.496676e-08,3.0,['kar'],1,1,1,"[Female, Male, Female]","[kar, kar, kar]",
3,3,shamsani,3.496676e-08,3.0,['andhra'],0,0,0,"[Female, Female, Male]","[andhra, andhra, andhra]",
4,4,njjaadaakaa,4.662235e-08,4.0,['odi'],1,1,1,"[Male, Female, Female, Female]","[odi, odi, odi, odi]",


In [34]:
def compute_female_prop(df_merged):
    df_merged["female_prop"] = df_merged.apply(
        lambda x: _count_female_prop(x["sex"]),
        axis=1,
    )
    return df_merged

In [35]:
df_merged_prop = compute_female_prop(df_merged)

In [37]:
df_merged_prop.iloc[0]["sex"]

['Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Female',
 'Female',
 'Female',
 'Female',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Female',
 'Female',
 'Female',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Female',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male',
 'Male

In [40]:
df_merged_prop.head(25)

Unnamed: 0.1,Unnamed: 0,last_name,total_freq_n,total_freq,gt_state,lstm_pred,rnn_pred,gru_pred,sex,state,female_prop
0,0,muphees,1.421982e-06,122.0,"['up', 'maharashtra', 'guj', 'mp']",1,1,1,"[Male, Male, Male, Male, Male, Male, Female, M...","[guj, guj, guj, guj, guj, guj, maharashtra, ma...",0.184874
1,1,raanoobaai,3.496676e-08,3.0,['maharashtra'],1,0,1,"[Female, Female, Female]","[maharashtra, maharashtra, maharashtra]",1.0
2,2,pilibhamtara,3.496676e-08,3.0,['kar'],1,1,1,"[Female, Male, Female]","[kar, kar, kar]",0.666667
3,3,shamsani,3.496676e-08,3.0,['andhra'],0,0,0,"[Female, Female, Male]","[andhra, andhra, andhra]",0.666667
4,4,njjaadaakaa,4.662235e-08,4.0,['odi'],1,1,1,"[Male, Female, Female, Female]","[odi, odi, odi, odi]",0.75
5,5,bradoo,1.864894e-07,16.0,"['delhi', 'har', 'maharashtra']",0,0,0,"[Male, Female, Male, Male, Male, Female, Femal...","[delhi, delhi, delhi, delhi, delhi, delhi, del...",0.4375
6,6,dhakkumalla,6.993352e-08,6.0,['andhra'],1,1,1,"[Female, Male, Female, Male, Male, Male]","[andhra, andhra, andhra, andhra, andhra, andhra]",0.333333
7,7,umaraavasimg,1.550193e-06,133.0,"['maharashtra', 'guj', 'mp']",1,1,1,"[Male, Male, Male, Male, Female, Female, Femal...","[guj, guj, maharashtra, maharashtra, maharasht...",0.285714
8,8,edavheen,6.993352e-08,6.0,['maharashtra'],0,0,0,"[Male, Male, Male, Female, Male, Female]","[maharashtra, maharashtra, maharashtra, mahara...",0.333333
9,9,loobhagaa,1.282115e-07,11.0,['jha'],0,0,0,"[महिला, पुरूष, महिला, महिला, पुरूष, पुरूष, महि...","[jha, jha, jha, jha, jha, jha, jha, jha, jha, ...",0.545455


In [41]:
df_merged_prop = df_merged_prop.drop(['Unnamed: 0', 'sex', 'state'], axis=1)

In [42]:
df_merged_prop.head(10)

Unnamed: 0,last_name,total_freq_n,total_freq,gt_state,lstm_pred,rnn_pred,gru_pred,female_prop
0,muphees,1.421982e-06,122.0,"['up', 'maharashtra', 'guj', 'mp']",1,1,1,0.184874
1,raanoobaai,3.496676e-08,3.0,['maharashtra'],1,0,1,1.0
2,pilibhamtara,3.496676e-08,3.0,['kar'],1,1,1,0.666667
3,shamsani,3.496676e-08,3.0,['andhra'],0,0,0,0.666667
4,njjaadaakaa,4.662235e-08,4.0,['odi'],1,1,1,0.75
5,bradoo,1.864894e-07,16.0,"['delhi', 'har', 'maharashtra']",0,0,0,0.4375
6,dhakkumalla,6.993352e-08,6.0,['andhra'],1,1,1,0.333333
7,umaraavasimg,1.550193e-06,133.0,"['maharashtra', 'guj', 'mp']",1,1,1,0.285714
8,edavheen,6.993352e-08,6.0,['maharashtra'],0,0,0,0.333333
9,loobhagaa,1.282115e-07,11.0,['jha'],0,0,0,0.545455


In [43]:
df_merged_prop.to_csv("dnn_pred.csv")

In [None]:
export_csv_gz(processed_df_clean, write_dir)


Writing dataframe to path: /Users/dhingratul/Documents/instate_data/instate_processed_clean.csv.gz
