## Practical 6: Pre-processing and cleaning the data by removing or replacing key values

### Import the necessary libraries (NumPy and pandas) for data processing

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### Reading the dataset

In [2]:
# Load the raw elephant census table from the CSV file that sits next to the
# notebook, then display it so the values needing cleanup are visible.
df = pd.read_csv(filepath_or_buffer='Elephantpopulation.csv')
df

Unnamed: 0,STATE,ELEPHANT POPULATION IN 1993,ELEPHANT POPULATION IN 1997,ELEPHANT POPULATION IN 2002,ELEPHANT POPULATION IN 2007,ELEPHANT POPULATION IN 2012,ELEPHANT POPULATION IN 2017
0,Arunachal Pradesh,2102,1800,1607,1690,890,1614
1,Assam,5524,5312,5246,5281,5620,5719
2,Meghalaya,2872,1840,1868,1811,1811 (Census not conducted),1754
3,Nagaland,178,158,145,152,212,446
4,Mizoram,15,22,33,12,Census not conducted,7
5,Manipur,50,30,12,Census not conducted,Census not conducted,9
6,Tripura,100,70,40,59,59,102
7,West Bengal (North),186,250,292,300-350,647,488
8,West Bengal (South),14,26,36,25,North and South Bengals are combined.,194
9,Jharkhand,550,618,772,624,688,679


### Replacing the string "Census not conducted" with NaN

In [3]:
# Cells whose entire content is the marker text become NaN. Cells that merely
# contain the marker alongside other text are not matched here.
df.replace(to_replace="Census not conducted", value=np.nan, inplace=True)
df

Unnamed: 0,STATE,ELEPHANT POPULATION IN 1993,ELEPHANT POPULATION IN 1997,ELEPHANT POPULATION IN 2002,ELEPHANT POPULATION IN 2007,ELEPHANT POPULATION IN 2012,ELEPHANT POPULATION IN 2017
0,Arunachal Pradesh,2102.0,1800.0,1607.0,1690,890,1614
1,Assam,5524.0,5312.0,5246.0,5281,5620,5719
2,Meghalaya,2872.0,1840.0,1868.0,1811,1811 (Census not conducted),1754
3,Nagaland,178.0,158.0,145.0,152,212,446
4,Mizoram,15.0,22.0,33.0,12,,7
5,Manipur,50.0,30.0,12.0,,,9
6,Tripura,100.0,70.0,40.0,59,59,102
7,West Bengal (North),186.0,250.0,292.0,300-350,647,488
8,West Bengal (South),14.0,26.0,36.0,25,North and South Bengals are combined.,194
9,Jharkhand,550.0,618.0,772.0,624,688,679


### Replacing ranges with the mean of their low and high bounds

In [4]:
def convert_range_to_avg(value):
    """Replace a "low-high" range string with the mean of its two bounds.

    Parameters
    ----------
    value : object
        A single cell value. Only strings of the form "<int>-<int>"
        (for example "300-350") are converted.

    Returns
    -------
    object
        ``(low + high) / 2`` as a float when ``value`` is a two-integer range
        string; otherwise ``value`` is returned unchanged.
    """
    if isinstance(value, str) and '-' in value:
        try:
            low, high = map(int, value.split('-'))
        except ValueError:
            # Not a clean two-integer range (e.g. "-5", "1-2-3", or free text
            # that happens to contain a hyphen): leave the value untouched
            # instead of aborting the whole cleaning step.
            return value
        return (low + high) / 2
    return value

# Run the range conversion over every column after the first one,
# writing each converted column back under its original name.
for column_name, column_values in df.iloc[:, 1:].items():
    df[column_name] = column_values.apply(convert_range_to_avg)

df

Unnamed: 0,STATE,ELEPHANT POPULATION IN 1993,ELEPHANT POPULATION IN 1997,ELEPHANT POPULATION IN 2002,ELEPHANT POPULATION IN 2007,ELEPHANT POPULATION IN 2012,ELEPHANT POPULATION IN 2017
0,Arunachal Pradesh,2102.0,1800.0,1607.0,1690.0,890,1614
1,Assam,5524.0,5312.0,5246.0,5281.0,5620,5719
2,Meghalaya,2872.0,1840.0,1868.0,1811.0,1811 (Census not conducted),1754
3,Nagaland,178.0,158.0,145.0,152.0,212,446
4,Mizoram,15.0,22.0,33.0,12.0,,7
5,Manipur,50.0,30.0,12.0,,,9
6,Tripura,100.0,70.0,40.0,59.0,59,102
7,West Bengal (North),186.0,250.0,292.0,325.0,647,488
8,West Bengal (South),14.0,26.0,36.0,25.0,North and South Bengals are combined.,194
9,Jharkhand,550.0,618.0,772.0,624.0,688,679


### Converting the strings to a numeric data type

In [5]:
def is_numeric_string(value):
    """Convert a cell value to an integer, or NaN when it is not numeric.

    Despite the ``is_`` prefix, this returns the converted value rather
    than a boolean.

    Parameters
    ----------
    value : object
        A single cell value (string, number, NaN, None, ...).

    Returns
    -------
    int or float
        ``int(value)`` when the value can be read as a number (floats are
        truncated toward zero, exactly as ``int`` does); ``np.nan`` otherwise.
    """
    try:
        # Fast path: ints, floats and plain integer strings such as "1690".
        return int(value)
    except (TypeError, OverflowError):
        # None / unsupported objects, or an infinite float: not a usable count.
        return np.nan
    except ValueError:
        # Either free text, NaN, or a decimal-formatted string such as "325.0";
        # the float-based fallback below tells these apart.
        pass

    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # Free text, NaN and infinity all end up here.
        return np.nan

# Convert every column after the first one (the first holds the state name)
# to numbers, one column at a time. Assigning whole columns lets each one take
# a numeric dtype, and Series.map avoids DataFrame.applymap, which is
# deprecated in pandas 2.1 and later.
for column_name in df.columns[1:]:
    df[column_name] = df[column_name].map(is_numeric_string)
df

Unnamed: 0,STATE,ELEPHANT POPULATION IN 1993,ELEPHANT POPULATION IN 1997,ELEPHANT POPULATION IN 2002,ELEPHANT POPULATION IN 2007,ELEPHANT POPULATION IN 2012,ELEPHANT POPULATION IN 2017
0,Arunachal Pradesh,2102.0,1800.0,1607.0,1690.0,890.0,1614
1,Assam,5524.0,5312.0,5246.0,5281.0,5620.0,5719
2,Meghalaya,2872.0,1840.0,1868.0,1811.0,,1754
3,Nagaland,178.0,158.0,145.0,152.0,212.0,446
4,Mizoram,15.0,22.0,33.0,12.0,,7
5,Manipur,50.0,30.0,12.0,,,9
6,Tripura,100.0,70.0,40.0,59.0,59.0,102
7,West Bengal (North),186.0,250.0,292.0,325.0,647.0,488
8,West Bengal (South),14.0,26.0,36.0,25.0,,194
9,Jharkhand,550.0,618.0,772.0,624.0,688.0,679


### Replacing the NaN values with the median of each column

In [6]:
# Fill the missing values of each census-year column with that column's median.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection: that form is chained assignment,
# which warns in pandas 2.x and leaves the frame unchanged once Copy-on-Write
# is enabled.
for census_year in (1993, 1997, 2002, 2007, 2012, 2017):
    column_name = f'ELEPHANT POPULATION IN {census_year}'
    df[column_name] = df[column_name].fillna(df[column_name].median())
df

Unnamed: 0,STATE,ELEPHANT POPULATION IN 1993,ELEPHANT POPULATION IN 1997,ELEPHANT POPULATION IN 2002,ELEPHANT POPULATION IN 2007,ELEPHANT POPULATION IN 2012,ELEPHANT POPULATION IN 2017
0,Arunachal Pradesh,2102.0,1800.0,1607.0,1690.0,890.0,1614
1,Assam,5524.0,5312.0,5246.0,5281.0,5620.0,5719
2,Meghalaya,2872.0,1840.0,1868.0,1811.0,688.0,1754
3,Nagaland,178.0,158.0,145.0,152.0,212.0,446
4,Mizoram,15.0,22.0,33.0,12.0,688.0,7
5,Manipur,50.0,30.0,12.0,624.0,688.0,9
6,Tripura,100.0,70.0,40.0,59.0,59.0,102
7,West Bengal (North),186.0,250.0,292.0,325.0,647.0,488
8,West Bengal (South),14.0,26.0,36.0,25.0,688.0,194
9,Jharkhand,550.0,618.0,772.0,624.0,688.0,679
