In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import random
from typing import List

In [2]:
data = pd.read_csv('/content/profit_templete_dataset.csv')

In [3]:
data.shape

(147, 5)

In [4]:
data.tail(50)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,City,Profit
97,75874.2,82456.9,145674.8,Sharm El Sheikh,79647.89
98,63456.8,98743.2,142456.7,Alexandria,76456.78
99,71234.8,85674.3,139874.2,Hurghada,74234.56
100,59876.4,102345.8,136234.8,Giza,72897.43
101,67845.3,78945.3,133674.9,Aswan,71634.21
102,55234.7,95674.2,130987.3,Cairo,69234.67
103,64785.9,81234.7,128456.7,Luxor,67892.89
104,52674.3,98234.5,125674.2,Alexandria,66534.78
105,60987.3,84672.1,122987.6,Mansoura,65187.56
106,48234.7,101234.8,120234.8,Cairo,63897.34


In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import random
from typing import List

class DatasetAugmenter:
    def __init__(self, df: pd.DataFrame, target_rows: int = 500):
        self.df = df.copy()
        self.target_rows = target_rows
        self.numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
        self.categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

    def add_noise(self, df: pd.DataFrame, noise_factor: float = 0.1) -> pd.DataFrame:
        augmented_df = df.copy()
        for col in self.numeric_columns:
            std_dev = augmented_df[col].std()
            if std_dev == 0:
                std_dev = 1e-4
            noise = np.random.normal(0, std_dev * noise_factor, len(augmented_df))
            augmented_df[col] = augmented_df[col] + noise
        return augmented_df

    def interpolate_rows(self, df: pd.DataFrame, n_samples: int) -> pd.DataFrame:
        augmented_rows = []
        non_zero_df = df[(df[self.numeric_columns] != 0).any(axis=1)]
        for _ in range(n_samples):
            if len(non_zero_df) < 2:
                break
            row1, row2 = non_zero_df.sample(2).to_dict('records')
            new_row = {}
            alpha = random.random()
            for col in df.columns:
                if col in self.numeric_columns:
                    new_row[col] = alpha * row1[col] + (1 - alpha) * row2[col]
                else:
                    new_row[col] = random.choice([row1[col], row2[col]])
            augmented_rows.append(new_row)
        return pd.DataFrame(augmented_rows)


    def smote_like_augmentation(self, df: pd.DataFrame, n_samples: int) -> pd.DataFrame:
        if not self.numeric_columns:
            return self.interpolate_rows(df, n_samples)
        non_zero_df = df[(df[self.numeric_columns] != 0).any(axis=1)].reset_index(drop=True)
        if len(non_zero_df) < 2:
            return self.interpolate_rows(df, n_samples)
        numeric_df = non_zero_df[self.numeric_columns]
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(numeric_df)
        k = min(5, len(non_zero_df) - 1)
        nbrs = NearestNeighbors(n_neighbors=k + 1).fit(scaled_data)
        augmented_rows = []
        for _ in range(n_samples):
            idx = random.randint(0, len(non_zero_df) - 1)
            distances, indices = nbrs.kneighbors([scaled_data[idx]])
            neighbor_idx = random.choice(indices[0][1:])
            row1 = non_zero_df.iloc[idx]
            row2 = non_zero_df.iloc[neighbor_idx]
            alpha = random.random()
            new_row = {}
            for col in df.columns:
                if col in self.numeric_columns:
                    new_row[col] = alpha * row1[col] + (1 - alpha) * row2[col]
                else:
                    new_row[col] = random.choice([row1[col], row2[col]])
            augmented_rows.append(new_row)
        return pd.DataFrame(augmented_rows)

    def bootstrap_sampling(self, df: pd.DataFrame, n_samples: int) -> pd.DataFrame:
        sampled_df = df.sample(n=n_samples, replace=True)
        return self.add_noise(sampled_df, noise_factor=0.05)

    def augment_dataset(self, methods: List[str] = None) -> pd.DataFrame:
        if methods is None:
            methods = ['noise', 'interpolate', 'smote', 'bootstrap']
        current_rows = len(self.df)
        rows_needed = self.target_rows - current_rows
        if rows_needed <= 0:
            print(f"Dataset already has {current_rows} rows, which is >= target of {self.target_rows}")
            return self.df
        print(f"Augmenting dataset from {current_rows} to {self.target_rows} rows")
        augmented_df = self.df.copy()
        rows_per_method = rows_needed // len(methods)
        remaining_rows = rows_needed % len(methods)
        for i, method in enumerate(methods):
            method_rows = rows_per_method + (1 if i < remaining_rows else 0)
            if method_rows == 0:
                continue
            if method == 'noise':
                sampled = self.df.sample(n=method_rows, replace=True)
                new_rows = self.add_noise(sampled)
            elif method == 'interpolate':
                new_rows = self.interpolate_rows(self.df, method_rows)
            elif method == 'smote':
                new_rows = self.smote_like_augmentation(self.df, method_rows)
            elif method == 'bootstrap':
                new_rows = self.bootstrap_sampling(self.df, method_rows)
            else:
                continue
            augmented_df = pd.concat([augmented_df, new_rows], ignore_index=True)
        print(f"Final dataset size: {len(augmented_df)} rows")
        return augmented_df


# ========= MAIN FUNCTION ========= #
def augment_csv(input_file: str, output_file: str, target_rows: int = 500):
    df = pd.read_csv(input_file)
    print(f"\nOriginal dataset shape: {df.shape}")
    augmenter = DatasetAugmenter(df, target_rows=target_rows)
    augmented_df = augmenter.augment_dataset()
    augmented_df.to_csv(output_file, index=False)
    print(f"\nAugmented dataset saved to: {output_file}")
    return augmented_df


# ========= EXECUTION ========== #
if __name__ == "__main__":
    input_csv = "/content/profit_templete_dataset.csv"  # from your uploaded file
    output_csv = "profit_augmented_dataset.csv"

    try:
        augmented_df = augment_csv(input_csv, output_csv, target_rows=500)
        print("\n" + "="*50)
        print("AUGMENTATION SUMMARY")
        print("="*50)
        print(f"Original rows: 150")
        print(f"Final rows: {len(augmented_df)}")
        print(f"Rows added: {len(augmented_df) - 150}")
        print("\nSample rows from result:")
        print(augmented_df.head())
    except Exception as e:
        print(f"An error occurred: {e}")



Original dataset shape: (147, 5)
Augmenting dataset from 147 to 500 rows
Final dataset size: 500 rows

Augmented dataset saved to: profit_augmented_dataset.csv

AUGMENTATION SUMMARY
Original rows: 150
Final rows: 500
Rows added: 350

Sample rows from result:
   R&D Spend  Administration  Marketing Spend             City     Profit
0   125487.3         89654.2         285673.8            Cairo  147892.65
1   118743.6         95421.4         267894.3       Alexandria  142156.78
2   112896.4         78934.2         248573.9             Giza  136574.23
3   108654.7         84672.1         231456.8            Cairo  129847.34
4   104782.3         69834.5         219874.6  Sharm El Sheikh  125634.89


In [6]:
df = pd.read_csv('/content/profit_augmented_dataset.csv')

In [7]:
df.shape

(500, 5)

In [8]:
df.tail(50)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,City,Profit
450,65959.685091,99350.875609,142547.314208,Alexandria,75081.889868
451,16968.449354,123537.707398,72485.687086,Ismailia,49787.304467
452,32656.514938,96458.926696,39314.428844,Giza,34184.373221
453,109015.901403,84737.955797,230167.073924,Cairo,130147.205148
454,78852.826648,83173.124023,156794.714073,Luxor,94611.413382
455,33848.339684,115467.949808,95655.901521,Aswan,54951.447874
456,50650.190533,80737.315249,67233.10366,Alexandria,47582.723851
457,33794.316479,95138.765211,38483.222291,Cairo,32741.538795
458,89331.81294,79119.688259,177194.238159,Alexandria,109691.547794
459,84229.060503,88583.794542,148748.363656,Cairo,93728.14921


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        500 non-null    float64
 1   Administration   500 non-null    float64
 2   Marketing Spend  500 non-null    float64
 3   City             500 non-null    object 
 4   Profit           500 non-null    float64
dtypes: float64(4), object(1)
memory usage: 19.7+ KB
