## Data cleaning and preprocessing with Pandas


In [69]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, PolynomialFeatures

# Load the dataset
df = pd.read_csv(
    'WA_Fn-UseC_-HR-Employee-Attrition.csv',
)
# Preview the first rows to confirm the file was parsed.
print(
    "DataFrame loaded from CSV file:\n",
    df.head(),
)

# 1. Identify missing values in the DataFrame.
def identify_missing_values(dataframe):
    """Count the missing entries in every column.

    Args:
        dataframe: The DataFrame to inspect.

    Returns:
        A Series indexed by column name holding the number of missing
        values found in that column.
    """
    missing_mask = dataframe.isna()
    return missing_mask.sum()

print(
    "\nMissing values in each column:\n",
    identify_missing_values(dataframe=df),
)

# 2. Drop rows with any missing values.
def drop_rows_with_missing_values(dataframe):
    """Return a new DataFrame without the rows that contain a missing value.

    Args:
        dataframe: The DataFrame to filter; it is not modified.

    Returns:
        A DataFrame holding only the rows in which every column is populated.
    """
    complete_rows = dataframe.dropna(axis=0, how='any')
    return complete_rows

df_dropped_rows = drop_rows_with_missing_values(dataframe=df)
print(
    "\nDataFrame after dropping rows with missing values:\n",
    df_dropped_rows.head(),
)

# 3. Drop columns with any missing values.
def drop_columns_with_missing_values(dataframe):
    """Return a new DataFrame without the columns that contain a missing value.

    Args:
        dataframe: The DataFrame to filter; it is not modified.

    Returns:
        A DataFrame holding only the columns that are fully populated.
    """
    complete_columns = dataframe.dropna(axis='columns', how='any')
    return complete_columns

df_dropped_cols = drop_columns_with_missing_values(dataframe=df)
print(
    "\nDataFrame after dropping columns with missing values:\n",
    df_dropped_cols.head(),
)

# 4. Fill missing values with a specific value.
def fill_missing_values_with_value(dataframe, value):
    """Return a new DataFrame with every missing entry replaced by ``value``.

    Args:
        dataframe: The DataFrame to fill; it is not modified.
        value: The replacement handed to ``DataFrame.fillna``.

    Returns:
        The filled DataFrame.
    """
    filled = dataframe.fillna(value=value)
    return filled

df_filled = fill_missing_values_with_value(df, value=0)
print(
    "\nDataFrame after filling missing values with 0:\n",
    df_filled.head(),
)

# 5. Fill missing values using forward fill and backward fill methods.
def fill_missing_values_ffill(dataframe):
    """Return a new DataFrame with gaps filled by forward fill.

    Args:
        dataframe: The DataFrame to fill; it is not modified.

    Returns:
        The result of ``DataFrame.ffill()`` on the input.
    """
    forward_filled = dataframe.ffill()
    return forward_filled

def fill_missing_values_bfill(dataframe):
    """Return a new DataFrame with gaps filled by backward fill.

    Args:
        dataframe: The DataFrame to fill; it is not modified.

    Returns:
        The result of ``DataFrame.bfill()`` on the input.
    """
    backward_filled = dataframe.bfill()
    return backward_filled

# Forward fill first ...
df_filled_ffill = fill_missing_values_ffill(dataframe=df)
print(
    "\nDataFrame after forward filling missing values:\n",
    df_filled_ffill.head(),
)

# ... then backward fill, each starting from the same source frame.
df_filled_bfill = fill_missing_values_bfill(dataframe=df)
print(
    "\nDataFrame after backward filling missing values:\n",
    df_filled_bfill.head(),
)

# 6. Interpolate missing values.
def interpolate_missing_values(dataframe):
    """Return a new DataFrame with gaps in interpolable columns filled.

    Object columns are first passed through ``infer_objects`` so that
    columns holding plain numbers are given a proper numeric dtype. Numeric,
    datetime and timedelta columns are then interpolated with the default
    settings of ``Series.interpolate``; every other column (object, string,
    categorical, ...) is returned unchanged.

    Args:
        dataframe: The DataFrame to process; it is not modified.

    Returns:
        A DataFrame with the same columns as the input.
    """
    dtype_checks = pd.api.types

    def _fill_gaps(col):
        # Comparing the dtype against 'object' is not enough: categorical
        # and dedicated string dtypes are not 'object', yet they cannot be
        # interpolated. Whitelist the dtypes that can be instead.
        dtype = col.dtype
        interpolable = (
            dtype_checks.is_numeric_dtype(dtype)
            or dtype_checks.is_datetime64_any_dtype(dtype)
            or dtype_checks.is_timedelta64_dtype(dtype)
        )
        return col.interpolate() if interpolable else col

    # The ``copy`` keyword is omitted on purpose: it is only accepted from
    # pandas 2.0 onwards and is not needed for a correct result.
    inferred = dataframe.infer_objects()
    return inferred.apply(_fill_gaps)

df_interpolated = interpolate_missing_values(dataframe=df)
print(
    "\nDataFrame after interpolating missing values:\n",
    df_interpolated.head(),
)

# 7. Convert a column to a different data type.
def convert_column_type(dataframe, column_name, new_type):
    """Return a copy of ``dataframe`` with one column cast to ``new_type``.

    The input frame is left untouched, so the returned frame is an
    independent result rather than an alias of the caller's data.

    Args:
        dataframe: The source DataFrame; it is not modified.
        column_name: Label of the column to convert.
        new_type: Target dtype, in any form accepted by ``Series.astype``.

    Returns:
        A new DataFrame in which ``column_name`` has dtype ``new_type``.

    Raises:
        KeyError: If ``column_name`` is not a column of ``dataframe``.
    """
    # Work on a copy: assigning into the argument would silently change the
    # caller's DataFrame as well.
    result = dataframe.copy()
    result[column_name] = result[column_name].astype(new_type)
    return result

df_converted = convert_column_type(
    df,
    column_name='Age',
    new_type='float',
)
print(
    "\nDataFrame after converting 'Age' column to float:\n",
    df_converted.head(),
)

# 8. Apply a function to transform the values of a column.
def transform_column_values(dataframe, column_name, func):
    """Return a copy of ``dataframe`` with ``func`` applied to one column.

    The input frame is left untouched, so the returned frame is an
    independent result rather than an alias of the caller's data.

    Args:
        dataframe: The source DataFrame; it is not modified.
        column_name: Label of the column to transform.
        func: Callable handed to ``Series.apply`` for that column.

    Returns:
        A new DataFrame in which ``column_name`` holds the transformed values.

    Raises:
        KeyError: If ``column_name`` is not a column of ``dataframe``.
    """
    # Work on a copy: assigning into the argument would silently change the
    # caller's DataFrame as well.
    result = dataframe.copy()
    result[column_name] = result[column_name].apply(func)
    return result

df_transformed = transform_column_values(
    df,
    column_name='MonthlyIncome',
    func=lambda income: income / 1000,
)
print(
    "\nDataFrame after transforming 'MonthlyIncome' column values:\n",
    df_transformed.head(),
)

# 9. Normalize a column using Min-Max scaling.
def normalize_column(dataframe, column_name):
    """Return a copy of ``dataframe`` with one column Min-Max scaled.

    The column is rescaled with a ``MinMaxScaler`` created with its default
    settings. The input frame is left untouched, so the returned frame is an
    independent result rather than an alias of the caller's data.

    Args:
        dataframe: The source DataFrame; it is not modified.
        column_name: Label of the column to rescale.

    Returns:
        A new DataFrame in which ``column_name`` holds the scaled values.

    Raises:
        KeyError: If ``column_name`` is not a column of ``dataframe``.
    """
    # Work on a copy: assigning into the argument would silently change the
    # caller's DataFrame as well.
    result = dataframe.copy()
    scaler = MinMaxScaler()
    # The double brackets select a one-column DataFrame, i.e. the 2-D input
    # the scaler is given.
    result[column_name] = scaler.fit_transform(result[[column_name]])
    return result

df_normalized = normalize_column(df, column_name='MonthlyIncome')
print(
    "\nDataFrame after normalizing 'MonthlyIncome' column:\n",
    df_normalized.head(),
)

# 10. Standardize a column (z-score normalization).
def standardize_column(dataframe, column_name):
    """Return a copy of ``dataframe`` with one column standardized.

    The column is transformed with a ``StandardScaler`` created with its
    default settings. The input frame is left untouched, so the returned
    frame is an independent result rather than an alias of the caller's data.

    Args:
        dataframe: The source DataFrame; it is not modified.
        column_name: Label of the column to standardize.

    Returns:
        A new DataFrame in which ``column_name`` holds the standardized values.

    Raises:
        KeyError: If ``column_name`` is not a column of ``dataframe``.
    """
    # Work on a copy: assigning into the argument would silently change the
    # caller's DataFrame as well.
    result = dataframe.copy()
    scaler = StandardScaler()
    # The double brackets select a one-column DataFrame, i.e. the 2-D input
    # the scaler is given.
    result[column_name] = scaler.fit_transform(result[[column_name]])
    return result

df_standardized = standardize_column(df, column_name='MonthlyIncome')
print(
    "\nDataFrame after standardizing 'MonthlyIncome' column:\n",
    df_standardized.head(),
)

# 11. Identify duplicate rows in the DataFrame.
def identify_duplicate_rows(dataframe):
    """Select the rows that repeat an earlier row.

    Args:
        dataframe: The DataFrame to inspect; it is not modified.

    Returns:
        A DataFrame holding every row flagged by ``DataFrame.duplicated``,
        i.e. each repeat after the first occurrence.
    """
    repeat_mask = dataframe.duplicated(keep='first')
    return dataframe.loc[repeat_mask]

duplicates = identify_duplicate_rows(dataframe=df)
print(
    "\nDuplicate rows in DataFrame:\n",
    duplicates,
)

# 12. Drop duplicate rows.
def drop_duplicate_rows(dataframe):
    """Return a new DataFrame with repeated rows removed.

    Args:
        dataframe: The DataFrame to de-duplicate; it is not modified.

    Returns:
        A DataFrame that keeps the first occurrence of every distinct row.
    """
    unique_rows = dataframe.drop_duplicates(keep='first')
    return unique_rows

df_no_duplicates = drop_duplicate_rows(dataframe=df)
print(
    "\nDataFrame after dropping duplicate rows:\n",
    df_no_duplicates.head(),
)

# 13. Drop duplicate rows based on specific columns.
def drop_duplicate_rows_specific_columns(dataframe, column_names):
    """Return a new DataFrame de-duplicated on a subset of columns.

    Args:
        dataframe: The DataFrame to de-duplicate; it is not modified.
        column_names: Column label(s) compared when looking for repeats,
            passed as ``subset`` to ``DataFrame.drop_duplicates``.

    Returns:
        A DataFrame that keeps the first row for every distinct combination
        of values in ``column_names``.
    """
    unique_rows = dataframe.drop_duplicates(subset=column_names, keep='first')
    return unique_rows

df_no_duplicates_specific = drop_duplicate_rows_specific_columns(
    df,
    column_names=['EmployeeNumber'],
)
print(
    "\nDataFrame after dropping duplicate rows based on 'EmployeeNumber':\n",
    df_no_duplicates_specific.head(),
)

# 14. Convert all string values in a column to lowercase.
def convert_column_to_lowercase(dataframe, column_name):
    """Return a copy of ``dataframe`` with one string column lower-cased.

    The input frame is left untouched, so the returned frame is an
    independent result rather than an alias of the caller's data.

    Args:
        dataframe: The source DataFrame; it is not modified.
        column_name: Label of the string column to convert.

    Returns:
        A new DataFrame in which ``column_name`` holds lower-case strings.

    Raises:
        KeyError: If ``column_name`` is not a column of ``dataframe``.
        AttributeError: If the column does not support the ``.str`` accessor.
    """
    # Work on a copy: assigning into the argument would silently change the
    # caller's DataFrame as well.
    result = dataframe.copy()
    result[column_name] = result[column_name].str.lower()
    return result

df_lowercase = convert_column_to_lowercase(df, column_name='Department')
print(
    "\nDataFrame after converting 'Department' column to lowercase:\n",
    df_lowercase.head(),
)

# 15. Remove leading and trailing spaces from string values in a column.
def strip_spaces_from_column(dataframe, column_name):
    """Return a copy of ``dataframe`` with one string column stripped.

    Leading and trailing whitespace is removed from every value with
    ``Series.str.strip``. The input frame is left untouched, so the returned
    frame is an independent result rather than an alias of the caller's data.

    Args:
        dataframe: The source DataFrame; it is not modified.
        column_name: Label of the string column to clean.

    Returns:
        A new DataFrame in which ``column_name`` holds the stripped strings.

    Raises:
        KeyError: If ``column_name`` is not a column of ``dataframe``.
        AttributeError: If the column does not support the ``.str`` accessor.
    """
    # Work on a copy: assigning into the argument would silently change the
    # caller's DataFrame as well.
    result = dataframe.copy()
    result[column_name] = result[column_name].str.strip()
    return result

df_stripped = strip_spaces_from_column(df, column_name='Department')
print(
    "\nDataFrame after stripping spaces from 'Department' column:\n",
    df_stripped.head(),
)

# 16. Replace a specific substring in a column with another substring.
# 17. Extract a substring from each value in a column.
# 18. Convert a column to datetime format.
# 19. Extract year, month, and day from a datetime column.
# 20. Filter rows based on a date range.
# 21. Convert a categorical column to numerical using one-hot encoding.
# 22. Convert a categorical column to numerical using label encoding.
# 23. Group values in a categorical column and create a new column with grouped categories.
# 24. Merge two DataFrames based on a common column.
# 25. Concatenate two DataFrames vertically.
# 26. Concatenate two DataFrames horizontally.
# 27. Create a new column based on existing columns.
# 28. Discretize a continuous column into bins.
# 29. Create polynomial features from existing numerical columns.


DataFrame loaded from CSV file:
    Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction Standard