## Importing Required Libraries

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): with no category/module argument this silences every warning
# for the rest of the session, including pandas deprecation/FutureWarnings —
# consider narrowing the filter.
warnings.filterwarnings('ignore')

## Loading Dataset

In [2]:
# Locate the input CSV relative to the current working directory.
# The path parts are passed separately so os.path.join chooses the separator
# itself instead of relying on a hard-coded '/' inside one component.
curr_dir = os.getcwd()
data_path = os.path.join(curr_dir, 'Dataset', 'data.csv')

In [3]:
# Read the CSV at data_path into `data`, the DataFrame every later cell works on.
data=pd.read_csv(data_path)

## Understanding dataset

In [4]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(4706, 6)

In [5]:
# Column names, non-null counts and dtypes — a quick check for missing values
# and for columns that were read as text.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4706 entries, 0 to 4705
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job title          4706 non-null   object 
 1   AI Impact          4706 non-null   object 
 2   Tasks              4706 non-null   int64  
 3   AI models          4706 non-null   int64  
 4   AI_Workload_Ratio  4706 non-null   float64
 5   Domain             4706 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 220.7+ KB


In [6]:
# Preview the first five rows to see how the values are formatted.
data.head()

Unnamed: 0,Job title,AI Impact,Tasks,AI models,AI_Workload_Ratio,Domain
0,Communications Manager,98%,365,2546,0.143362,Communication & PR
1,Data Collector,95%,299,2148,0.139199,Data & IT
2,Data Entry,95%,325,2278,0.142669,Administrative & Clerical
3,Mail Clerk,95%,193,1366,0.141288,Administrative & Clerical
4,Compliance Officer,92%,194,1369,0.141709,Medical & Healthcare


In [7]:
# List the distinct values that occur in the 'Domain' column.
data['Domain'].unique()

array(['Communication & PR', 'Data & IT', 'Administrative & Clerical',
       'Medical & Healthcare', 'Leadership & Strategy', 'Law Enforcement',
       'Construction', 'Sales & Marketing', 'Hospitality',
       'Supply Chain & Logistics'], dtype=object)

In [8]:
# Summary statistics of the numeric columns as loaded (before any cleaning).
data.describe()

Unnamed: 0,Tasks,AI models,AI_Workload_Ratio
count,4706.0,4706.0,4706.0
mean,400.708032,1817.678071,inf
std,311.564781,1086.853037,
min,1.0,0.0,0.036585
25%,161.0,1085.25,0.137271
50%,270.0,1577.5,0.199281
75%,608.75,2273.0,0.260572
max,1387.0,5666.0,inf


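## Replacing inf values

`describe()` above reports `inf` as the mean and max of `AI_Workload_Ratio` (and no standard deviation), so the infinite entries are converted to NaN before going further.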

In [9]:
# Turn +/-inf in AI_Workload_Ratio into NaN so summary statistics stay finite.
# The head() rows match Tasks / AI models, so the infinities presumably come
# from rows where 'AI models' is 0 — TODO confirm against the data source.
#
# The result is assigned back instead of calling replace(inplace=True) on the
# data[...] selection: that chained in-place form operates on an intermediate
# Series and stops updating `data` once pandas Copy-on-Write is in effect.
data['AI_Workload_Ratio'] = data['AI_Workload_Ratio'].replace([np.inf, -np.inf], np.nan)

In [10]:
# Re-check the summary statistics after the inf -> NaN replacement above.
data.describe()

Unnamed: 0,Tasks,AI models,AI_Workload_Ratio
count,4706.0,4706.0,4699.0
mean,400.708032,1817.678071,0.204619
std,311.564781,1086.853037,0.076547
min,1.0,0.0,0.036585
25%,161.0,1085.25,0.137262
50%,270.0,1577.5,0.198885
75%,608.75,2273.0,0.26045
max,1387.0,5666.0,1.0


## Data cleaning

In [11]:
# Convert the 'AI Impact' percentage strings (e.g. '98%') into fractions (0.98).
# The dtype guard keeps this cell re-runnable: once the column is numeric the
# .str accessor would raise, so the conversion is skipped on later runs.
if not pd.api.types.is_numeric_dtype(data['AI Impact']):
    data['AI Impact'] = data['AI Impact'].str.strip('%').astype(float) / 100
data['AI Impact']

0       0.98
1       0.95
2       0.95
3       0.95
4       0.92
        ... 
4701    0.05
4702    0.05
4703    0.05
4704    0.05
4705    0.05
Name: AI Impact, Length: 4706, dtype: float64

In [12]:
# Drop rows that repeat an earlier row in every column (first occurrence is
# kept); the frame is modified in place.
data.drop_duplicates(subset=None, keep='first', inplace=True)

## Feature extraction

In [13]:
def impact_level(impact):
    """Map an AI-impact fraction to a qualitative band.

    Parameters
    ----------
    impact : float
        Impact expressed as a fraction (0.98 for "98%").

    Returns
    -------
    str or float
        'Revolutionary' for impact >= 0.9, 'Significant' for >= 0.75,
        'Moderate' for >= 0.5, 'Minor' for >= 0.25 and 'Negligible' for
        anything lower. A missing input (None / NaN) yields NaN so that it
        stays recognisable as missing instead of being labelled.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing value would fall through to 'Negligible' (and None would raise).
    if pd.isna(impact):
        return np.nan

    # Thresholds are tested from highest to lowest, so each branch only needs
    # its lower bound.
    if impact >= 0.9:
        return 'Revolutionary'
    if impact >= 0.75:
        return 'Significant'
    if impact >= 0.5:
        return 'Moderate'
    if impact >= 0.25:
        return 'Minor'
    return 'Negligible'

# Derive a categorical 'Impact_Level' column by applying impact_level to each
# 'AI Impact' value.
data['Impact_Level'] = data['AI Impact'].apply(impact_level)


In [14]:
# Number of rows that fall into each impact band.
data['Impact_Level'].value_counts()

Impact_Level
Negligible       1993
Minor            1909
Moderate          674
Significant       110
Revolutionary      20
Name: count, dtype: int64

In [16]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
data.to_csv(path_or_buf='Dataset/Clean_Data.csv', index=False)