In [147]:
import os
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PyPDF2 as p

warnings.filterwarnings('ignore')

### LOADING IN THE DATA

In [148]:
# Directory holding the raw H1B CSV dumps. The H1B_DATA_DIR environment
# variable overrides the original author's local path, so the notebook can be
# run on another machine without editing this cell.
file_location = os.environ.get(
    'H1B_DATA_DIR',
    '/Users/ankitkothari/Documents/COMPLETED_PROJECTS/H1B_data_analysis/h1b_dump_data/',
)
# The final cell writes to a path relative to the working directory, so the
# working directory is still switched to the data folder here.
os.chdir(file_location)

# The final cell saves its result into this same folder; skip that file so a
# re-run does not load the previous output back in as if it were raw input.
output_file_name = 'optimized_h1b_data.csv'
# os.listdir() returns names in arbitrary order; sorting makes the row order
# of the combined frame reproducible between runs.
csv_files = sorted(
    k for k in os.listdir('.')
    if k.endswith('.csv') and k != output_file_name
)
if not csv_files:
    # Without this guard the failure surfaces as a KeyError on 'Tax ID'.
    raise FileNotFoundError(f'No input .csv files found in {file_location}')

# Read every file first and concatenate once: concatenating inside the loop
# re-copies all previously loaded rows on every iteration.
data = pd.concat([pd.read_csv(k) for k in csv_files])
data = data.drop(columns=['Tax ID'])

### BASELINE MEMORY USAGE

In [149]:
print(f' columns {data.columns}')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being interpolated into an f-string (which printed
# "data types None").
print('data types')
data.info(memory_usage="deep")

# A deep memory scan inspects every Python object in the object columns, so it
# is done once and every figure below is derived from the same result.
baseline_bytes = data.memory_usage(deep=True)
print(f' Memory usage in MB \n {baseline_bytes.sort_values()/(1024*1024)}')

baseline_columns = list(data.columns)
# Series.memory_usage() counts the index as well as the values, so the 'Index'
# entry is added back to each column to keep these figures identical to
# data[col].memory_usage(deep=True).
baseline_usage = [
    round((baseline_bytes[col] + baseline_bytes['Index'])/(1024*1024), 0)
    for col in baseline_columns
]

 columns Index(['City', 'Continuing Approvals', 'Continuing Denials', 'Employer',
       'Fiscal Year', 'Initial Approvals', 'Initial Denials', 'NAICS', 'State',
       'ZIP'],
      dtype='object')


KeyboardInterrupt: 

### OPTIMIZING COLUMNS

In [None]:
# Wall-clock start of the optimization pass; the elapsed time is derived from
# this value once the last conversion loop has finished.
data_preprocessing_start_time = time.time()

# Text columns that are candidates for the 'category' dtype.
category_columns = [
    'State',
    'City',
]
# Count columns that are passed to pd.to_numeric further down.
string_columns = [
    'Initial Approvals',
    'Initial Denials',
    'Continuing Approvals',
    'Continuing Denials',
]
# Columns that were already parsed as integers when the CSVs were read.
int_columns = data.select_dtypes(include=['int']).columns
print(int_columns)

### OPTIMIZING INT

In [None]:
# Downcast every integer column to the smallest signed integer dtype that can
# hold its values, then report the column's memory footprint.
for col in int_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce', downcast='integer')
    optimized_mb = data[col].memory_usage(deep=True)/(1024 ** 2)
    # The label used to read "Category Data type", copied from the category
    # cell; these columns are integers.
    print(f'Optimized Memory use in column name {col} in Integer Data type  {col} {optimized_mb:03.2f} MB')


### OPTIMIZING OBJECT COLUMNS

In [None]:
# Accumulated seconds spent inside astype('category'). It has to exist before
# the first `+=` below; without this line the loop raised NameError.
total_time = 0.0
for col in category_columns:
    num_unique_values = len(data[col].unique())
    num_total_values = len(data[col])
    # Convert only when fewer than half of the values are distinct.
    if num_unique_values / num_total_values < 0.5:
        start = time.time()
        data[col] = data[col].astype('category')
        total_time += time.time() - start
        print(f'Optimized Memory use in column name {col} in Category Data type  {col} {data[col].memory_usage(deep=True)/(1024 ** 2):03.2f} MB')


### OPTIMIZING FLOAT

In [None]:
string_columns = ['Initial Approvals', 'Initial Denials', 'Continuing Approvals', 'Continuing Denials']
# Accumulated seconds spent inside pd.to_numeric. It has to exist before the
# first `+=` below; without this line the loop raised NameError.
total_time_float = 0.0
for col in string_columns:
    num_unique_values = len(data[col].unique())
    num_total_values = len(data[col])
    print(f'Memory Use in in column name {col} Object Data type {col} {data[col].memory_usage(deep=True)/(1024 ** 2):03.2f} MB')
    # Convert only when fewer than half of the values are distinct.
    if num_unique_values / num_total_values < 0.5:
        start_float = time.time()
        # errors='coerce' turns values that cannot be parsed into NaN;
        # downcast='float' then picks the smallest float dtype that fits.
        data[col] = pd.to_numeric(data[col], errors='coerce', downcast='float')
        total_time_float += time.time() - start_float
        # The label used to read "Category Data type", copied from the
        # category cell; these columns become floats.
        print(f'Optimized Memory use in column name {col} in Float Data type  {col} {data[col].memory_usage(deep=True)/(1024 ** 2):03.2f} MB')
# Minutes elapsed since data_preprocessing_start_time was recorded.
data_preprocessing_optimization_time = (time.time() - data_preprocessing_start_time)/60

### TIME TAKEN FOR MEMORY OPTIMIZATION

In [None]:
# Report how long the memory-optimization pass took, in minutes.
elapsed_minutes = data_preprocessing_optimization_time
print(f'Time Taken for Data Preprocessing Memory Optimization  {elapsed_minutes:03.2f} mins')

### OPTIMIZED MEMORY USAGE

In [None]:
# Per-column memory after optimization, smallest first, in MB.
print(f' Memory usage in MB \n {data.memory_usage(deep=True).sort_values()/(1024*1024)}')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being interpolated into an f-string (which printed
# "data types None").
print('data types')
data.info(memory_usage="deep")

### CODES FOR COLUMNS CONVERTED TO CATEGORIES

In [None]:
# Pair each position in the State category list with its label.
d = {code: label for code, label in enumerate(data['State'].cat.categories)}
print(d)
# The distinct City labels held by the categorical column.
city_categories = data['City'].cat.categories
print(city_categories)

### VISUALIZATION OF MEMORY OPTIMIZATION 

In [None]:
columns = list(data.columns)
# One deep scan of the whole frame instead of one per column.
optimized_bytes = data.memory_usage(deep=True)
# Series.memory_usage() counts the index as well as the values, so the 'Index'
# entry is added back to each column to keep these figures identical to
# data[col].memory_usage(deep=True).
optimized_usage = [
    round((optimized_bytes[col] + optimized_bytes['Index'])/(1024*1024), 0)
    for col in columns
]

In [None]:
# Display the column names that the usage lists are ordered by.
columns

In [None]:
# Display the per-column memory (MB, rounded) recorded before optimization.
baseline_usage


In [None]:
# Display the per-column memory (MB, rounded) recorded after optimization.
optimized_usage 

In [None]:
# One row per column, holding its memory in MB before and after optimization.
df = pd.DataFrame({
    'baseline_usage': baseline_usage,
    'optimized_usage': optimized_usage,
}, index=baseline_columns)

# Newer matplotlib releases renamed the bundled seaborn style to
# 'seaborn-v0_8' and dropped the old name; use whichever this install has.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# stacked=False overlays the two areas so they can be compared directly.
ax = df.plot.area(stacked=False)
ax.set_xlabel('Column Names', fontsize=16)
ax.set_ylabel('Memory Usage in MB', fontsize=16)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
# NOTE(review): the totals in the title are hard-coded, not derived from the
# lists plotted above -- update them if the input data changes.
plt.title('Memory Optimization for columns from 4.4gb to 1.2gb', fontsize=16)
plt.show()

In [None]:
# Preview the first rows of the optimized frame.
data.head()

In [None]:
# Relative path: the file is written into the current working directory.
optimized_csv_path = './optimized_h1b_data.csv'
data.to_csv(optimized_csv_path)