In [1]:
import numpy as np 
import pandas as pd

class Dataprepkit:
    """Small toolkit for loading a tabular file and running common prep steps.

    The loaded frame lives on ``self.data``. Every summary/transformation
    method returns a new object and leaves ``self.data`` untouched, so the
    caller decides whether to keep the result.
    """

    def __init__(self):
        # Filled in by read_data(); None means nothing has been loaded yet.
        self.data = None

    def read_data(self, file_path, file_format):
        """Load ``file_path`` into ``self.data``.

        Parameters
        ----------
        file_path : str or path-like
            Location of the file to read.
        file_format : str
            One of ``'csv'``, ``'excel'`` or ``'json'``. Case and surrounding
            whitespace are ignored so interactively typed values such as
            ``"CSV "`` are accepted.

        Notes
        -----
        Any failure (including an unsupported format) is printed rather than
        raised, and ``self.data`` keeps whatever value it had before the call.
        """
        try:
            # Normalise inside the try so a non-string format is reported
            # through the same printed-error path as every other failure.
            normalized_format = str(file_format).strip().lower()
            if normalized_format == 'csv':
                self.data = pd.read_csv(file_path)
            elif normalized_format == 'excel':
                self.data = pd.read_excel(file_path)
            elif normalized_format == 'json':
                self.data = pd.read_json(file_path)
            else:
                raise ValueError("Unsupported file format")

        except Exception as e:
            print(f"Error reading file: {str(e)}")

    def gen_summary(self):
        """Return ``self.data.describe()``, or print a notice and return None if no data is loaded."""
        if self.data is not None:
            return self.data.describe()
        else:
            print("No data available.")

    def most_frequent_values(self):
        """Return the first row of ``self.data.mode()`` as a Series.

        Prints a notice and returns None if no data is loaded.
        """
        if self.data is not None:
            return self.data.mode().iloc[0]
        else:
            print("No data available.")

    def handle_missing_values(self, handling_Type):
        """Return a new frame with missing values handled; ``self.data`` is not modified.

        Parameters
        ----------
        handling_Type : str
            ``'mean'``   - fill missing values with each numeric column's mean.
            ``'median'`` - fill missing values with each numeric column's median.
            ``'drop'``   - drop every row that contains a missing value.

        Returns
        -------
        pandas.DataFrame or None
            The processed frame, or None when no data is loaded or when an
            error (including an unsupported strategy) was printed.
        """
        # Same guard as the other methods, so a missing frame is reported
        # clearly instead of surfacing as an attribute error on None.
        if self.data is None:
            print("No data available.")
            return None

        try:
            if handling_Type == 'mean':
                # numeric_only=True: the frame may hold text columns, and the
                # statistic is only defined for the numeric ones. Columns absent
                # from the resulting Series are simply left unfilled by fillna.
                return self.data.fillna(self.data.mean(numeric_only=True))
            elif handling_Type == 'median':
                return self.data.fillna(self.data.median(numeric_only=True))
            elif handling_Type == 'drop':
                return self.data.dropna()
            else:
                raise ValueError("Unsupported missing value handling strategy")
        except Exception as e:
            print(f"Error handling missing values: {str(e)}")

    def one_hot_encoding(self, columns):
        """Return ``pd.get_dummies(self.data, columns=columns)``.

        Prints a notice and returns None if no data is loaded.
        """
        if self.data is not None:
            return pd.get_dummies(self.data, columns=columns)
        else:
            print("No data available.")

    def label_encoding(self, column):
        """Return the integer category codes of ``self.data[column]`` as a Series.

        Prints a notice and returns None if no data is loaded.
        """
        if self.data is not None:
            labels = self.data[column].astype('category').cat.codes
            return labels
        else:
            print("No data available.")
            
            

# Build the kit, then ask the user where the file lives and how it is stored.
# read_data prints any loading error instead of raising, so check obj.data afterwards.
obj = Dataprepkit()
file_path = input("Enter your file path: ")
file_format = input("Enter the file format: ")
obj.read_data(file_path, file_format)

Enter your file path: E:\term 2\Data Analysis\Amgad\Lect 2- Descriptive Statistics\Flavors.csv
Enter the file format: csv


In [2]:
# Display the DataFrame that read_data stored on the kit.
obj.data

Unnamed: 0,Flavor,Base Flavor,Liked,Flavor Rating,Texture Rating,Total Rating
0,Mint Chocolate Chip,Vanilla,Yes,10.0,8.0,18.0
1,Chocolate,Chocolate,Yes,8.8,7.6,16.6
2,Vanilla,Vanilla,No,4.7,5.0,9.7
3,Cookie Dough,Vanilla,Yes,6.9,6.5,13.4
4,Rocky Road,Chocolate,Yes,8.2,7.0,15.2
5,Pistachio,Vanilla,No,2.3,,5.7
6,Cake Batter,Vanilla,Yes,6.5,6.0,12.5
7,Neapolitan,Vanilla,No,3.8,5.0,8.8
8,Chocolte Fudge Brownie,Chocolate,Yes,8.2,7.1,15.3


In [3]:
# Summary statistics of the loaded data (DataFrame.describe()).
obj.gen_summary()

Unnamed: 0,Flavor Rating,Texture Rating,Total Rating
count,9.0,8.0,9.0
mean,6.6,6.525,12.8
std,2.5387,1.122179,4.030509
min,2.3,5.0,5.7
25%,4.7,5.75,9.7
50%,6.9,6.75,13.4
75%,8.2,7.225,15.3
max,10.0,8.0,18.0


In [4]:
# First row of DataFrame.mode() for the loaded data.
obj.most_frequent_values()

Flavor            Cake Batter
Base Flavor           Vanilla
Liked                     Yes
Flavor Rating             8.2
Texture Rating            5.0
Total Rating              5.7
Name: 0, dtype: object

In [5]:
# Show the data with missing values filled from the column means.
# The returned frame is only displayed here, not stored; obj.data is not reassigned.
obj.handle_missing_values('mean')

  return self.data.fillna(self.data.mean())


Unnamed: 0,Flavor,Base Flavor,Liked,Flavor Rating,Texture Rating,Total Rating
0,Mint Chocolate Chip,Vanilla,Yes,10.0,8.0,18.0
1,Chocolate,Chocolate,Yes,8.8,7.6,16.6
2,Vanilla,Vanilla,No,4.7,5.0,9.7
3,Cookie Dough,Vanilla,Yes,6.9,6.5,13.4
4,Rocky Road,Chocolate,Yes,8.2,7.0,15.2
5,Pistachio,Vanilla,No,2.3,6.525,5.7
6,Cake Batter,Vanilla,Yes,6.5,6.0,12.5
7,Neapolitan,Vanilla,No,3.8,5.0,8.8
8,Chocolte Fudge Brownie,Chocolate,Yes,8.2,7.1,15.3


In [6]:
# Integer category codes for the 'Liked' column.
obj.label_encoding('Liked')

0    1
1    1
2    0
3    1
4    1
5    0
6    1
7    0
8    1
dtype: int8

In [7]:
# One-hot encode 'Base Flavor' with pd.get_dummies, applied to obj.data as loaded.
obj.one_hot_encoding(['Base Flavor'])

Unnamed: 0,Flavor,Liked,Flavor Rating,Texture Rating,Total Rating,Base Flavor_Chocolate,Base Flavor_Vanilla
0,Mint Chocolate Chip,Yes,10.0,8.0,18.0,0,1
1,Chocolate,Yes,8.8,7.6,16.6,1,0
2,Vanilla,No,4.7,5.0,9.7,0,1
3,Cookie Dough,Yes,6.9,6.5,13.4,0,1
4,Rocky Road,Yes,8.2,7.0,15.2,1,0
5,Pistachio,No,2.3,,5.7,0,1
6,Cake Batter,Yes,6.5,6.0,12.5,0,1
7,Neapolitan,No,3.8,5.0,8.8,0,1
8,Chocolte Fudge Brownie,Yes,8.2,7.1,15.3,1,0
