In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [5]:
class AutoEDA:
    """Interactive helper for quick exploratory data analysis on a DataFrame.

    Every public method asks for its options on stdin via ``input``, so the
    class is intended for interactive notebook / console sessions and cannot
    run unattended.

    Attributes:
        dataframe: The pandas DataFrame the cleaning and plotting methods
            operate on. May be ``None`` when the instance is only used to
            call ``read_data``.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    # ------------------------------------------------------------------
    # Prompt helpers shared by the public methods
    # ------------------------------------------------------------------
    @staticmethod
    def _ask_choice(prompt, choices, invalid_message):
        """Prompt until the stripped, lower-cased answer is one of ``choices``."""
        while True:
            answer = input(prompt).strip().lower()
            if answer in choices:
                return answer
            print(invalid_message)

    def _ask_xy_columns(self):
        """Prompt for two column names until both exist in ``self.dataframe``."""
        df = self.dataframe
        while True:
            x_column = input("Enter the first column: ")
            y_column = input("Enter the second column: ")
            # Report the first missing name (x is checked before y), then
            # ask for both names again.
            missing = next(
                (name for name in (x_column, y_column) if name not in df.columns),
                None,
            )
            if missing is None:
                return x_column, y_column
            print(f"Column '{missing}' not found in the DataFrame.")

    # ------------------------------------------------------------------
    # Loading and cleaning
    # ------------------------------------------------------------------
    def read_data(self):
        """Prompt for a file type and path and load the file with pandas.

        The prompts repeat until a file is loaded successfully. A missing
        file and any other load error (parse error, permission problem,
        missing Excel engine, ...) are reported and the prompts start over.

        Returns:
            The loaded DataFrame. ``self.dataframe`` is not modified.
        """
        readers = {'csv': pd.read_csv, 'excel': pd.read_excel}

        while True:
            data_type = self._ask_choice(
                "Enter the data type (csv, excel): ",
                readers,
                "Unsupported data type. Supported types: 'csv', 'excel'",
            )
            # Pasted paths often carry stray surrounding whitespace.
            data_path = input("Enter the path to the data file: ").strip()

            try:
                data = readers[data_type](data_path)
            except FileNotFoundError:
                print("File not found. Please check the file path.")
                continue
            except Exception as e:
                # Keep the interactive loop alive instead of aborting on a
                # malformed or unreadable file.
                print(f"An error occurred: {e}")
                continue

            print("Your data has been loaded")
            return data

    def handle_missing_values(self, inplace=True):
        """Fill or drop missing values using an interactively chosen method.

        'mean' and 'median' fill each numeric column with that column's
        statistic; non-numeric columns are left unchanged, so they may still
        contain missing values. 'drop' removes every row that has a missing
        value.

        Args:
            inplace: When True (default) ``self.dataframe`` itself is
                modified and returned; when False a copy is modified and
                returned.

        Returns:
            The processed DataFrame, or ``None`` if an error occurred (the
            error is printed).
        """
        method = self._ask_choice(
            "Enter the method to handle missing values ('mean', 'median', 'drop'): ",
            ('mean', 'median', 'drop'),
            "Invalid method. Supported methods: 'mean', 'median', 'drop'",
        )

        try:
            df = self.dataframe if inplace else self.dataframe.copy()

            # numeric_only=True: since pandas 2.0 the reductions raise
            # TypeError on text columns instead of silently skipping them.
            if method == 'mean':
                df.fillna(df.mean(numeric_only=True), inplace=True)
            elif method == 'median':
                df.fillna(df.median(numeric_only=True), inplace=True)
            else:  # 'drop'
                df.dropna(inplace=True)

            print("Missing values handled successfully.")
            return df
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    def handle_categorical_data(self, inplace=True):
        """Encode object-dtype columns using an interactively chosen method.

        'OHE' one-hot encodes with ``pd.get_dummies(..., drop_first=True)``;
        'LE' replaces each object column with ``LabelEncoder`` integer codes.

        Args:
            inplace: When True (default) the encoded frame becomes
                ``self.dataframe``. With 'LE' the original object is
                modified column by column; 'OHE' builds a new frame and
                ``self.dataframe`` is rebound to it. When False a copy is
                encoded and ``self.dataframe`` is left untouched.

        Returns:
            The encoded DataFrame, or ``None`` if an error occurred (the
            error is printed).
        """
        method = self._ask_choice(
            "Enter the method to handle categorical values ('OHE', 'LE'): ",
            ('ohe', 'le'),
            "Invalid method. Supported methods: 'OHE', 'LE'",
        )

        try:
            df = self.dataframe if inplace else self.dataframe.copy()
            columns = df.select_dtypes(include=['object']).columns

            if method == 'ohe':
                df = pd.get_dummies(df, columns=columns, drop_first=True)
                if inplace:
                    # get_dummies returns a new frame; keep the instance in
                    # step with the returned result.
                    self.dataframe = df
            else:  # 'le'
                for col in columns:
                    df[col] = LabelEncoder().fit_transform(df[col])

            print("Categorical data handled successfully.")
            return df
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    def scaling_data(self, inplace=True):
        """Scale every column of ``self.dataframe`` with a chosen scaler.

        'min_max' uses ``MinMaxScaler`` and 'standard' uses
        ``StandardScaler``. The whole frame is handed to the scaler, so a
        column the scaler cannot treat as numeric makes the call fail, in
        which case the error is printed and ``None`` is returned.

        Args:
            inplace: When True (default) ``self.dataframe`` is rebound to
                the scaled frame. The scaled data is always a new DataFrame;
                the object originally passed in is never modified.

        Returns:
            A new DataFrame with the same columns and index holding the
            scaled values, or ``None`` if an error occurred.
        """
        method = self._ask_choice(
            "Enter the method to scale your data ('min_max','standard'): ",
            ('min_max', 'standard'),
            "Invalid method. Supported methods: 'min_max', 'standard'",
        )

        try:
            # Imported inside the try block so a missing scikit-learn is
            # reported like any other failure.
            from sklearn.preprocessing import MinMaxScaler, StandardScaler

            # Scale the instance's own data, not a notebook-level global.
            source = self.dataframe
            scaler = MinMaxScaler() if method == 'min_max' else StandardScaler()
            df = pd.DataFrame(
                scaler.fit_transform(source),
                columns=source.columns,
                index=source.index,  # keep row labels aligned with the input
            )
            if inplace:
                self.dataframe = df
        except Exception as e:
            print(f"An error occurred: {e}")
            df = None

        return df

    # ------------------------------------------------------------------
    # Plots
    # ------------------------------------------------------------------
    def create_boxplot(self):
        """Prompt for a column and show its box plot.

        The prompt repeats until an existing column has been plotted
        successfully. Missing values are dropped before plotting.
        """
        df = self.dataframe

        while True:
            column = input("Enter the column to show boxplot visualization: ")

            if column not in df.columns:
                print(f"Column '{column}' not found in the DataFrame.")
                continue

            fig, ax = plt.subplots(figsize=(8, 6))
            try:
                # NaN values would make matplotlib draw an empty box.
                ax.boxplot(df[column].dropna(), vert=True)
                ax.set_title(f"Box Plot of {column}")
                ax.set_xlabel(column)
                ax.set_ylabel("Values")
                plt.show()
                return
            except Exception as e:
                print(f"An error occurred: {e}")
            # Only reached on failure: discard the half-built figure.
            plt.close(fig)

    def create_barchart(self):
        """Prompt for two columns and show a bar chart of the second vs. the first.

        The prompts repeat until both columns exist and the chart has been
        drawn successfully.
        """
        df = self.dataframe

        while True:
            x_column, y_column = self._ask_xy_columns()

            fig, ax = plt.subplots(figsize=(8, 6))
            try:
                ax.bar(df[x_column], df[y_column])
                ax.set_title(f"Bar Chart of {y_column} vs. {x_column}")
                ax.set_xlabel(x_column)
                ax.set_ylabel(y_column)
                plt.show()
                return
            except KeyError:
                print("One or more columns not found in the DataFrame.")
            except Exception as e:
                print(f"An error occurred: {e}")
            # Only reached on failure: discard the half-built figure.
            plt.close(fig)

    def create_scatter(self):
        """Prompt for two columns and show a scatter plot of the second vs. the first.

        The prompts repeat until both columns exist and the plot has been
        drawn successfully.
        """
        df = self.dataframe

        while True:
            x_column, y_column = self._ask_xy_columns()

            fig, ax = plt.subplots(figsize=(8, 6))
            try:
                ax.scatter(df[x_column], df[y_column], c='blue', alpha=0.5, s=30)
                ax.set_title(f"Scatter Plot of {y_column} vs. {x_column}")
                ax.set_xlabel(x_column)
                ax.set_ylabel(y_column)
                ax.grid(True)
                plt.show()
                return
            except KeyError:
                print("One or more columns not found in the DataFrame.")
            except Exception as e:
                print(f"An error occurred: {e}")
            # Only reached on failure: discard the half-built figure.
            plt.close(fig)

    def create_heatmap(self):
        """Show an annotated correlation heatmap of the numeric and bool columns.

        Other columns are ignored because a correlation cannot be computed
        for them. If no suitable column exists a message is printed and
        nothing is drawn.
        """
        numeric = self.dataframe.select_dtypes(include=['number', 'bool'])
        if numeric.shape[1] == 0:
            print("No numeric columns available for a correlation heatmap.")
            return

        fig, ax = plt.subplots(figsize=(12, 10))
        sns.heatmap(numeric.corr(), annot=True, cmap='coolwarm', center=0, ax=ax)
        ax.set_title("Correlation Heatmap")
        plt.show()

In [6]:
# Load step. No DataFrame exists yet, so the helper is created with None and
# only its read_data() method is used here.
eda_tool = AutoEDA(dataframe=None)
# read_data() prompts on stdin for the file type and path and returns the
# loaded DataFrame. NOTE: this cell blocks until the prompts are answered.
loaded_data = eda_tool.read_data()
print("Loaded Data:")
# Last expression of the cell: preview of the first rows.
loaded_data.head()

KeyboardInterrupt: Interrupted by user

In [None]:
# Summary statistics of the loaded frame (depends on loaded_data from the load cell).
print('Description of data is:')
loaded_data.describe()

In [None]:
# Number of missing values per column, before any cleaning.
print('Missing values in data are:')
loaded_data.isna().sum()

In [None]:
# Missing-value step: prompts for 'mean', 'median' or 'drop'.
auto_eda = AutoEDA(loaded_data)
# With the default inplace=True the method works on loaded_data itself and
# returns that same object; it returns None if an error was reported.
handled_data = auto_eda.handle_missing_values()
# Missing values per column after the step.
handled_data.isna().sum()

In [None]:
# Categorical-encoding step: prompts for 'OHE' (one-hot) or 'LE' (label encoding).
auto_eda = AutoEDA(handled_data)
# handled_data is rebound to the encoded frame (None if an error was reported).
handled_data = auto_eda.handle_categorical_data()
handled_data.head()

In [None]:
# Scaling step: prompts for 'min_max' or 'standard'.
auto_eda = AutoEDA(handled_data)
# handled_data is rebound to the scaled frame (None if an error was reported).
handled_data = auto_eda.scaling_data()
handled_data.head()

In [None]:
# Box plot of one column of the processed data; the column name is asked interactively.
auto_eda = AutoEDA(handled_data)
auto_eda.create_boxplot()

In [None]:
# Bar chart of two interactively chosen columns.
# Only the row slice 0:1000 is passed in — presumably to keep the chart
# readable and fast to draw; TODO confirm the intended sample size.
auto_eda = AutoEDA(handled_data[0:1000])
auto_eda.create_barchart()

In [None]:
# Scatter plot of two interactively chosen columns of the processed data.
auto_eda = AutoEDA(handled_data)
auto_eda.create_scatter()

In [None]:
# Correlation heatmap of the processed data (no prompt).
auto_eda = AutoEDA(handled_data)
auto_eda.create_heatmap()