# In [4]:
import os
import time

import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def converting_to_numeral(csv_path, output_path=None):
    """Encode the categorical grades as integers and add derived columns.

    Reads ``csv_path``, replaces the ``cut``, ``clarity`` and ``color``
    labels with ordinal integers, adds the ``z_depth`` and ``table_width``
    columns, drops every row that contains a NaN and writes the result as CSV.

    Labels that are missing from the mapping tables become NaN when mapped,
    so those rows are removed by the final ``dropna``.

    Args:
        csv_path: Path of the CSV file to read.
        output_path: Destination of the preprocessed CSV. When ``None`` the
            file is written next to the input as
            ``<input without extension>_preprocessed.csv``.

    Returns:
        The path the preprocessed CSV was written to.
    """
    df = pd.read_csv(csv_path)

    # Ordinal encodings for 'cut', 'clarity' and 'color'
    cut_mapping = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
    clarity_mapping = {'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4, 'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}

    color_mapping = {'D': 1, 'E': 2, 'F': 3, 'G': 4, 'H': 5, 'I': 6, 'J': 7, 'K': 8, 'L': 9, 'M': 10,
                     'N': 11, 'O': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'U': 18, 'V': 19, 'W': 20,
                     'X': 21, 'Y': 22, 'Z': 23}

    # Convert 'cut', 'clarity', and 'color' to integers
    df['cut'] = df['cut'].map(cut_mapping)
    df['clarity'] = df['clarity'].map(clarity_mapping)
    df['color'] = df['color'].map(color_mapping)

    # Calculate 'z_depth' and 'table_width' columns
    df['z_depth'] = df['depth'] * df['z'] * 100
    df['table_width'] = df['table'] * df['x'] * 100

    # Drop rows with NaN values (if any)
    df.dropna(inplace=True)

    # Save the preprocessed DataFrame to a new CSV file
    if output_path is None:
        # Only the final extension is swapped. A plain str.replace('.csv', ...)
        # would also rewrite '.csv' inside directory names and, for inputs
        # without a '.csv' suffix, would return the input path itself and
        # overwrite the source file.
        stem, _extension = os.path.splitext(csv_path)
        output_path = f"{stem}_preprocessed.csv"
    df.to_csv(output_path, index=False)

    print(f"Preprocessed dataset saved at: {output_path}")

    return output_path

def regularize_original_file(dataset_path):
    """Write a copy of the dataset that keeps only the expected columns.

    The copy contains, in this order, ``carat``, ``cut``, ``color``,
    ``clarity``, ``depth``, ``table``, ``x``, ``y``, ``z`` and ``price``;
    any other column is discarded.

    Args:
        dataset_path: Path of the CSV file to read.

    Returns:
        Path of the written file: the input path with its extension replaced
        by ``_regularized.csv``.

    Raises:
        KeyError: If one of the expected columns is missing from the dataset.
    """
    data = pd.read_csv(dataset_path)

    # List of columns to keep
    columns_regularized = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'price']

    # Drop columns not in the specified set
    data_regularized = data[columns_regularized]

    # Save the regularized dataset. Only the final extension is swapped so
    # that '.csv' inside directory names is left alone and an input without a
    # '.csv' suffix is never overwritten.
    stem, _extension = os.path.splitext(dataset_path)
    regularized_path = f"{stem}_regularized.csv"
    data_regularized.to_csv(regularized_path, index=False)
    return regularized_path


def test_reg_file(dataset_path):
    columns_regulirized = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'price']
    
    # Read the original dataset
    data = pd.read_csv(dataset_path)
    columns_file = list(data.columns)
    
    if columns_file == columns_regulirized:
        return True
    else:
        return False
    

def load_and_preprocess_data(dataset_path):
    """Load a dataset and split it into training and test sets.

    The ``price`` column is the target; every other column is a feature.
    20% of the rows are held out for testing and the split is seeded
    (``random_state=42``) so it is reproducible.

    Args:
        dataset_path: Path of the CSV file to load.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    frame = pd.read_csv(dataset_path)

    # Separate the feature columns from the target column.
    features = frame.drop(columns=['price'])
    target = frame['price']

    # train_test_split yields [X_train, X_test, y_train, y_test].
    return tuple(
        train_test_split(features, target, test_size=0.2, random_state=42)
    )

def train_model(X_train, y_train):
    """Fit a linear regression on the training data.

    Args:
        X_train: Training features.
        y_train: Training target values.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    regressor = LinearRegression()
    # fit() returns the estimator itself, so it can be returned directly.
    return regressor.fit(X_train, y_train)

# Function to evaluate the model
def evaluate_model(model, X_test, y_test, margin):
    """Score a fitted model on the held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True target values for ``X_test``.
        margin: Relative tolerance, as a fraction of the true value
            (``0.1`` means within 10%).

    Returns:
        The tuple ``(mse, percentage_within_margin)``: the mean squared error
        and the percentage (0-100) of test samples whose prediction lies
        within ``margin`` of the true value.
    """
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    # The tolerance is relative to the magnitude of the true value; abs()
    # keeps it non-negative even if a target is negative.
    within_tolerance = abs(y_test - y_pred) <= margin * abs(y_test)

    # Vectorised count instead of iterating the samples in Python; int()
    # keeps the plain-Python number types the caller received before.
    within_margin = int(within_tolerance.sum())
    percentage_within_margin = (within_margin / len(y_test)) * 100
    return mse, percentage_within_margin


def save_model(model, model_path):
    """Serialize ``model`` to ``model_path`` with joblib and report the path.

    Args:
        model: Object to persist.
        model_path: Destination file path.
    """
    joblib.dump(value=model, filename=model_path)
    print(f"Model saved at {model_path}")
"""
# Draft (disabled): evaluate the model separately on 3 price ranges
def detailed_evaluatio_model(model, X_test, y_test, margin):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse
"""
"""
Long-tailed to deca 1 or deca 10
def detailed_evaluatio_model(model, X_test, y_test, margin,tail):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse
"""
    
def plot_model(df_visualization, margin, evaluated_actual_price=None,
               evaluated_predicted_price=None):
    """Report and plot the predictions that miss by more than the typical error.

    Samples are split into over-estimated (``Price Difference`` > 0) and
    under-estimated (``Price Difference`` < 0). Within each group only the
    samples whose ``Absolute Difference`` exceeds ``(1 + margin)`` times the
    group's mean absolute difference are kept; they are counted, summarised
    on stdout and drawn as a scatter plot against the actual price.

    Side effects: adds the boolean columns ``Underestimated`` and
    ``Overestimated`` to ``df_visualization`` in place, prints a summary and
    shows a matplotlib figure.

    NOTE(review): this function uses ``plt`` and ``sns`` from module scope
    (presumably ``matplotlib.pyplot`` and ``seaborn``); they are not imported
    in the visible import block, so confirm they are imported before calling.

    Args:
        df_visualization: DataFrame with the columns ``Actual Prices``,
            ``Predicted Prices``, ``Price Difference`` and
            ``Absolute Difference``.
        margin: Fraction added on top of each group's mean absolute
            difference to form the filtering threshold.
        evaluated_actual_price: Actual price of a single sample to highlight
            with a red marker. Optional.
        evaluated_predicted_price: Predicted price of that same sample.
            Optional; both values must be given for them to be used.

    When the two ``evaluated_*`` arguments are not both supplied, the
    function falls back to the module-level names ``y_test``, ``i`` and
    ``prediction`` that earlier versions read implicitly; if those do not
    exist either, the highlighted marker is simply omitted.
    """
    underestimate_threshold = 0
    df_visualization['Underestimated'] = df_visualization['Price Difference'] < underestimate_threshold
    df_visualization['Overestimated'] = df_visualization['Price Difference'] > underestimate_threshold

    # Filter out samples within a margin
    avg_abs_diff_overestimated = df_visualization.loc[df_visualization['Overestimated'], 'Absolute Difference'].mean()
    avg_abs_diff_underestimated = df_visualization.loc[df_visualization['Underestimated'], 'Absolute Difference'].mean()
    df_filtered_overestimated = df_visualization.loc[df_visualization['Overestimated'] & (df_visualization['Absolute Difference'] > (1 + margin) * avg_abs_diff_overestimated)]
    df_filtered_underestimated = df_visualization.loc[df_visualization['Underestimated'] & (df_visualization['Absolute Difference'] > (1 + margin) * avg_abs_diff_underestimated)]

    # Count the number of original samples
    total_samples = len(df_visualization)

    # Count the number of samples after filtering
    total_samples_filtered = len(df_filtered_overestimated) + len(df_filtered_underestimated)
    total_overestimated_filtered = len(df_filtered_overestimated)
    total_underestimated_filtered = len(df_filtered_underestimated)

    # Average error of the samples that survived the filter (drawn as
    # horizontal reference lines below)
    avg_overestimated_filtered = df_filtered_overestimated['Absolute Difference'].mean()
    avg_underestimated_filtered = df_filtered_underestimated['Absolute Difference'].mean()

    # Print the evaluation metrics
    print(f'Total original samples: {total_samples}')
    print(f'Total samples after filtering ( {100*margin} %): {total_samples_filtered} ({total_samples_filtered / total_samples * 100:.2f}%)')
    print(f'Total overestimated samples after filtering (> {underestimate_threshold} and margin {margin}): {total_overestimated_filtered} '
          f'({total_overestimated_filtered / total_samples * 100:.2f}%)')
    print(f'Total underestimated samples after filtering (< {underestimate_threshold} and margin {margin}): {total_underestimated_filtered} '
          f'({total_underestimated_filtered / total_samples * 100:.2f}%)')

    # Plot the differences with actual prices on the x-axis after filtering
    plt.figure(figsize=(12, 6))
    sns.scatterplot(x='Actual Prices', y='Price Difference', hue='Predicted Prices', data=df_filtered_overestimated)
    sns.scatterplot(x='Actual Prices', y='Price Difference', hue='Predicted Prices', data=df_filtered_underestimated)

    # Resolve the sample to highlight in red.
    if evaluated_actual_price is None or evaluated_predicted_price is None:
        # Backward compatibility: earlier versions read these values from
        # module-level variables. Keep doing so when they exist, but no
        # longer fail with NameError half-way through the plot when they
        # do not.
        try:
            evaluated_actual_price = y_test.iloc[i]
            evaluated_predicted_price = prediction
        except NameError:
            evaluated_actual_price = None
            evaluated_predicted_price = None

    if evaluated_actual_price is not None and evaluated_predicted_price is not None:
        evaluated_sample_difference = evaluated_predicted_price - evaluated_actual_price

        # Size of this sample's error relative to the average filtered
        # over-estimation, in percent
        percentage_out = (evaluated_sample_difference / avg_overestimated_filtered) * 100

        # '\u0394' is the Greek capital delta; the escape keeps the label
        # intact regardless of the file's encoding.
        plt.scatter(evaluated_actual_price, evaluated_sample_difference, color='red', marker='X', s=100, label=f'Evalu. Sample \u0394{percentage_out:.2f}% ')

    # Plot horizontal lines
    plt.axhline(y=0, color='black', linestyle='--', label='Zero Difference')
    plt.axhline(y=avg_overestimated_filtered, color='red', linestyle='--', label=f'Avg Overestimated: {avg_overestimated_filtered:.2f}')
    plt.axhline(y=-avg_underestimated_filtered, color='blue', linestyle='--', label=f'Avg Underestimated: {avg_underestimated_filtered:.2f}')

    plt.xlabel('Actual Prices')
    plt.ylabel('Price Difference (Predicted - Actual)')
    plt.title(f'Linear Regression: Differences between Predicted and Actual Prices (Margin={margin * 100}%)')
    plt.legend(title='Predicted Prices')
    plt.show()

def explain(model, explainer, X_test, instance_index, feature_names):
    """Print and display the LIME explanation for one test sample.

    Args:
        model: Fitted estimator exposing ``predict``.
        explainer: Object exposing ``explain_instance`` (a LIME tabular
            explainer in this file's usage).
        X_test: DataFrame of test features.
        instance_index: Positional index of the row of ``X_test`` to explain.
        feature_names: Feature names; only their count is used here, as the
            number of features to include in the explanation.
    """
    row = X_test.iloc[instance_index].values

    # predict() expects a 2-D input, hence the single-row list.
    prediction = model.predict([row])[0]

    lime_result = explainer.explain_instance(
        row,
        model.predict,
        num_features=len(feature_names),
    )

    # Textual report
    print("LIME Explanation:")
    print(f"Predicted Price: {prediction}")
    print("Feature Weights:")
    for feature, weight in lime_result.as_list():
        print(f"{feature}: {weight}")

    # Rich rendering for notebook front-ends
    lime_result.show_in_notebook()
    
def train_model_buffer(data_directory, model_path, buffer_sleep_time, margin):
    """Keep retraining the model on every dataset dropped into a directory.

    Each pass trains, evaluates and saves a model for every file found
    directly inside ``data_directory``, then moves that file into the
    ``processed`` subdirectory so it is not picked up again. The function
    then sleeps and starts over; it never returns.

    Args:
        data_directory: Directory watched for new dataset files.
        model_path: Where the trained model is saved (overwritten each time).
        buffer_sleep_time: Seconds to wait between two passes.
        margin: Relative tolerance passed to ``evaluate_model``.
    """
    processed_directory = os.path.join(data_directory, 'processed')
    # The archive folder must exist before files can be moved into it.
    os.makedirs(processed_directory, exist_ok=True)

    while True:
        for dataset_file in os.listdir(data_directory):
            dataset_path = os.path.join(data_directory, dataset_file)

            # Skip subdirectories - in particular 'processed' itself, which
            # lives inside the watched directory and is not a dataset.
            if not os.path.isfile(dataset_path):
                continue

            # Load and preprocess the data
            X_train, X_test, y_train, y_test = load_and_preprocess_data(dataset_path)

            # Train the model
            model = train_model(X_train, y_train)

            # Evaluate the model
            mse, percentage_within_margin = evaluate_model(model, X_test, y_test, margin)
            print(f'Mean Squared Error: {mse}')
            print(f'Percentage of Samples within {margin * 100}%: {percentage_within_margin:.2f}%')

            # Save the trained model
            save_model(model, model_path)

            # Archive the dataset. os.replace overwrites an existing file of
            # the same name on every platform, whereas os.rename fails on
            # Windows when the destination already exists.
            processed_dataset_path = os.path.join(processed_directory, dataset_file)
            os.replace(dataset_path, processed_dataset_path)

        print(f"Waiting for {buffer_sleep_time} seconds before checking for new datasets...")
        time.sleep(buffer_sleep_time)
    


def main():
    """Train, evaluate, save and explain a model for each buffered dataset.

    For every file directly inside the buffer directory a linear regression
    is trained, scored with a 10% tolerance, saved to the model path
    (overwriting the previous one) and one test sample is explained with LIME.

    NOTE(review): ``lime`` is used here but is not imported in the visible
    import block; confirm ``import lime.lime_tabular`` is executed before
    this function runs.
    """
    margin10 = 0.1
    # Specify the directory where new datasets are added
    data_directory = "C:\\Users\\cassiorodrigo.crisfa\\Documents\\GitHub\\xtream-ai-assignment\\Cristani\\buffer"

    # Specify the path to save the trained model
    model_path = "C:\\Users\\cassiorodrigo.crisfa\\Documents\\GitHub\\xtream-ai-assignment\\Cristani\\model.joblib"

    # Loop through each dataset in the directory
    for dataset_file in os.listdir(data_directory):
        dataset_path = os.path.join(data_directory, dataset_file)

        # Subdirectories (e.g. an archive folder) are not datasets.
        if not os.path.isfile(dataset_path):
            continue

        # Load and preprocess the data
        X_train, X_test, y_train, y_test = load_and_preprocess_data(dataset_path)

        # Train the model
        model = train_model(X_train, y_train)

        # Evaluate the model
        mse, percentage_within_margin = evaluate_model(model, X_test, y_test, margin10)
        print(f'Mean Squared Error: {mse}')
        print(f'Percentage of Samples within {margin10 * 100}%: {percentage_within_margin:.2f}%')

        # Save the trained model
        save_model(model, model_path)

        # Explain the model. The feature names come from the training frame,
        # which is also the data the explainer is built on.
        feature_names = list(X_train.columns)
        explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values, mode="regression", feature_names=feature_names)

        # Explain test sample 555, or the last one when the test set is
        # smaller than that.
        sample = min(555, len(X_test) - 1)
        explain(model, explainer, X_test, sample, feature_names)

    
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
