In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn 
import os
from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from  sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA

class DataIngestion:
    """Load a medal-table dataset from disk and prepare it for analysis.

    The file is read as soon as the object is constructed. The reader is
    chosen from the file extension: ``.csv``, ``.json`` or ``.xlsx``
    (matched case-insensitively).
    """

    def __init__(self, file_path):
        """Remember *file_path* and load it into ``self.df``.

        Raises:
            ValueError: If the file extension is not one of the supported
                formats.
        """
        self.file_path = file_path
        # The loaded DataFrame; ``clean_data`` replaces it with the cleaned one.
        self.df = self.load_data()

    def load_data(self):
        """Read ``self.file_path`` into a DataFrame based on its extension.

        Returns:
            The DataFrame produced by the matching pandas reader.

        Raises:
            ValueError: If the extension is not ``.csv``, ``.json`` or
                ``.xlsx``.
        """
        file_extension = os.path.splitext(self.file_path)[1]
        # Compare case-insensitively so "medals.CSV" loads like "medals.csv".
        normalized_extension = file_extension.lower()
        if normalized_extension == '.csv':
            return pd.read_csv(self.file_path)
        elif normalized_extension == '.json':
            return pd.read_json(self.file_path)
        elif normalized_extension == '.xlsx':
            return pd.read_excel(self.file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")

    def clean_data(self):
        """Fill missing medal counts, recompute totals and drop unnamed rows.

        Missing ``Gold``/``Silver``/``Bronze`` values become 0, ``Total`` is
        overwritten with their sum, and rows lacking ``Country`` or
        ``Country Code`` are removed. ``self.df`` is updated; nothing is
        returned.
        """
        # Handling missing values: an absent medal count is treated as zero.
        for medal in ('Gold', 'Silver', 'Bronze'):
            self.df[medal] = self.df[medal].fillna(0)
        # Recompute the total so it always agrees with the filled-in counts.
        self.df["Total"] = self.df['Gold'] + self.df['Silver'] + self.df['Bronze']
        # Rows without a country identity cannot be reported on.
        self.df = self.df.dropna(subset=['Country', 'Country Code'])

    def get_dataframe(self):
        """Return the current (possibly cleaned) DataFrame."""
        return self.df
        
def generate_summaries(df):
    """Print the shape, per-column missing-value counts and describe() table.

    Args:
        df: The DataFrame to summarise.

    Returns:
        None. The three reports are written to standard output.
    """
    dimensions = df.shape
    print(f"\nThe shape of the dataset: {dimensions}\n")

    missing_per_column = df.isnull().sum()
    print(f"Number of missing values by features:\n{missing_per_column}\n")

    descriptive_stats = df.describe()
    print(f"Central tendencies of dataset:\n{descriptive_stats}\n")

    return None

def generate_visualizations(df):
    """Display four charts describing the medal table, one window at a time.

    Each figure is shown with ``plt.show(block=True)``. The charts are, in
    order: a histogram of the three medal counts, a bar chart of total medals
    per country (descending), a pie chart of medal types, and a correlation
    heatmap of the medal columns.

    Args:
        df: DataFrame with ``Country``, ``Gold``, ``Silver``, ``Bronze`` and
            ``Total`` columns.
    """
    medal_columns = ['Gold', 'Silver', 'Bronze']

    # Figure 1: distribution of Gold, Silver, and Bronze medals.
    plt.figure(figsize=(12, 6))
    sns.histplot(df[medal_columns], kde=True, bins=10)
    plt.title('Distribution of Gold, Silver, and Bronze Medals')
    plt.xlabel('Number of Medals')
    plt.ylabel('Frequency')
    plt.legend(medal_columns)
    plt.show(block=True)

    # Figure 2: total medals per country, largest first.
    plt.figure(figsize=(12, 6))
    ranked = df.sort_values(by='Total', ascending=False)
    bar_axes = sns.barplot(
        x='Country',
        y='Total',
        data=ranked,
        hue='Country',
        palette='viridis',
        legend=False,
    )
    # Country names are long; turn them sideways so they do not overlap.
    bar_axes.tick_params(axis='x', rotation=90)
    plt.title('Total Medals by Country')
    plt.xlabel('Country')
    plt.ylabel('Total Medals')
    plt.show(block=True)

    # Figure 3: share of each medal type across the whole table.
    totals_by_type = df[medal_columns].sum()
    plt.figure(figsize=(8, 8))
    plt.pie(
        totals_by_type,
        labels=totals_by_type.index,
        autopct='%1.1f%%',
        colors=['gold', 'silver', '#cd7f32'],
    )
    plt.title('Total Medal Counts by Type')
    plt.show(block=True)

    # Figure 4: pairwise correlation between the medal columns.
    medal_correlations = df[medal_columns + ['Total']].corr()
    plt.figure(figsize=(5, 4))
    sns.heatmap(medal_correlations, annot=True, cmap='coolwarm', fmt='.2f')
    plt.show(block=True)

    plt.pause(0.5)

class AnalysisEngine:
    """Run clustering, PCA and classification over a medal-table DataFrame.

    The methods add result columns (``Cluster``, ``PCA1``/``PCA2``,
    ``Gold_Category``, ``Gold_Category_Encoded``) directly to the DataFrame
    passed to the constructor.
    """

    # Numeric columns used as model inputs by every analysis.
    _FEATURE_COLUMNS = ('Gold', 'Silver', 'Bronze', 'Total')
    # Ordered category names for the gold-medal buckets.
    _GOLD_LABELS = ('Low', 'Medium', 'High')

    def __init__(self, df):
        """Keep a reference to *df*; it is modified in place by the analyses."""
        self.df = df

    def kmeans_clustering(self, n_clusters):
        """Cluster the rows with K-Means and store labels in ``Cluster``.

        Args:
            n_clusters: Number of clusters to fit.

        Returns:
            The fitted ``KMeans`` estimator.
        """
        X = self.df[list(self._FEATURE_COLUMNS)]
        # Honour the caller's requested cluster count (was hard-coded to 3).
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        self.df['Cluster'] = kmeans.fit_predict(X)
        return kmeans

    def pca_analysis(self):
        """Project the medal columns onto two principal components.

        The projections are stored in the ``PCA1`` and ``PCA2`` columns.

        Returns:
            The fitted ``PCA`` estimator.
        """
        pca = PCA(n_components=2)
        components = pca.fit_transform(self.df[list(self._FEATURE_COLUMNS)])
        self.df['PCA1'] = components[:, 0]
        self.df['PCA2'] = components[:, 1]
        return pca

    def _labelled_split(self):
        """Bucket ``Gold`` into Low/Medium/High and return a 70/30 split.

        Adds ``Gold_Category`` and ``Gold_Category_Encoded`` to ``self.df``.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` from ``train_test_split``
            with ``test_size=0.3`` and ``random_state=0``.
        """
        # Buckets: (-1, 3] -> Low, (3, 14] -> Medium, (14, inf) -> High.
        self.df['Gold_Category'] = pd.cut(
            self.df['Gold'],
            bins=[-1, 3, 14, float('inf')],
            labels=list(self._GOLD_LABELS),
        )
        # Explicit category order keeps the encoding Low=0, Medium=1, High=2.
        encoder = OrdinalEncoder(categories=[list(self._GOLD_LABELS)])
        self.df['Gold_Category_Encoded'] = encoder.fit_transform(self.df[['Gold_Category']])

        X = self.df[list(self._FEATURE_COLUMNS)]
        y = self.df['Gold_Category_Encoded']
        return train_test_split(X, y, test_size=0.3, random_state=0)

    def decision_tree_classification(self):
        """Fit a decision tree that predicts the gold-medal bucket.

        Returns:
            ``(fitted DecisionTreeClassifier, X_test, y_test)``.
        """
        X_train, X_test, y_train, y_test = self._labelled_split()
        decision_tree = DecisionTreeClassifier(random_state=0)
        decision_tree.fit(X_train, y_train)
        return (decision_tree, X_test, y_test)

    def random_forest_classification(self):
        """Fit a random forest that predicts the gold-medal bucket.

        Returns:
            ``(fitted RandomForestClassifier, X_test, y_test)``.
        """
        X_train, X_test, y_train, y_test = self._labelled_split()
        random_forest = RandomForestClassifier(random_state=0)
        random_forest.fit(X_train, y_train)
        return (random_forest, X_test, y_test)

def cli_interface():
    """Interactive session: load a dataset, summarise it, run one analysis.

    Prompts for a dataset path, cleans and summarises the data, shows the
    charts, then asks which analysis to run (K-Means, PCA, decision tree or
    random forest) and prints its result. Invalid menu choices or an invalid
    cluster count print a message and return without running an analysis.

    Raises:
        ValueError: Propagated from ``DataIngestion`` when the file
            extension is unsupported.
    """
    print("Welcome to the AI Employee")
    raw_path = input("Please enter the path to your dataset (CSV, JSON, Excel): ")
    # Pasted paths often carry stray whitespace or surrounding quotes.
    file_path = raw_path.strip().strip('"\'')
    processor = DataIngestion(file_path)
    processor.clean_data()
    df = processor.get_dataframe()

    generate_summaries(df)
    generate_visualizations(df)

    engine = AnalysisEngine(df)
    model_type = input("Choose analysis: [1] K-Means Clustering [2] PCA [3] Decision Tree Classification [4]RandomForest Classification ").strip()

    if model_type == '1':
        try:
            n_clusters = int(input("Enter the number of clusters: "))
        except ValueError:
            n_clusters = 0
        if n_clusters < 1:
            # Report bad input instead of crashing with a traceback.
            print("Invalid number of clusters. Exiting.")
            return
        model = engine.kmeans_clustering(n_clusters)
        print(f"K-Means Clustering: Centroids={model.cluster_centers_}")
        # Visualize the clusters
        plt.scatter(df['Gold'], df['Total'], c=df['Cluster'])
        plt.xlabel('Gold Medals')
        plt.ylabel('Total Medals')
        plt.title('Clustering of Countries by Medal Counts')
        plt.show()

    elif model_type == '2':
        pca = engine.pca_analysis()
        print(f"PCA Components: {pca.components_}")

    elif model_type in ('3', '4'):
        if model_type == '3':
            heading = "Decision Tree Classifier:"
            classifier, X_test, y_test = engine.decision_tree_classification()
        else:
            heading = "Random Forest Classifier:"
            classifier, X_test, y_test = engine.random_forest_classification()

        # Predict on the held-out test set and evaluate.
        predictions = classifier.predict(X_test)
        print(heading)
        print("Accuracy:", accuracy_score(y_test, predictions))
        # Pin the label set: without it, a class missing from the test split
        # makes the three target_names mismatch and the report raises.
        print(classification_report(
            y_test,
            predictions,
            labels=[0.0, 1.0, 2.0],
            target_names=['Low', 'Medium', 'High'],
        ))

    else:
        print("Invalid choice. Exiting.")
        return
        
# Start the interactive session as soon as this cell/script is executed.
cli_interface()

Welcome to the AI Employee


Please enter the path to your dataset (CSV, JSON, Excel):  w


ValueError: Unsupported file format: 

In [3]:
# Module-level medal table; process_query reads this global directly.
# NOTE(review): loaded without DataIngestion.clean_data, so a 'Total' column
# must already exist in the CSV for total-medal queries — confirm.
df = pd.read_csv("olympics2024.csv")

In [4]:
import spacy

# Shared spaCy pipeline; process_query uses its named entities (doc.ents).
nlp = spacy.load("en_core_web_sm")

def process_query(query):
    """Answer a free-text question about the module-level medal table ``df``.

    Supported questions (keywords matched case-insensitively):
      * contains "gold" and "most": the country with the most gold medals;
      * contains "total" and "medal" plus a place name: that country's total;
      * contains "silver": the silver-medal table, highest first.

    Args:
        query: The user's question.

    Returns:
        A human-readable answer, or a fallback message when the question is
        not understood.
    """
    # Match keywords on a lower-cased copy so "Gold" and "gold" both work.
    text = query.lower()
    # Run NER on the ORIGINAL text: lower-casing first tends to hide proper
    # nouns from the model, and the extracted name is compared to the data.
    doc = nlp(query)

    if "gold" in text:
        if "most" in text:
            result = df.loc[df['Gold'].idxmax(), 'Country']
            return f"The country with the most gold medals is {result}."
        # Other gold-medal questions are not supported yet; fall through to
        # the default reply below.

    elif "total" in text and "medal" in text:
        # Query for total medals of a specific country (first place entity).
        country = [ent.text.strip() for ent in doc.ents if ent.label_ == 'GPE']
        if country:
            # Case-insensitive comparison so "france" still finds "France".
            is_match = df['Country'].str.casefold() == country[0].casefold()
            result = df.loc[is_match, 'Total'].values
            if len(result) > 0:
                return f"The total number of medals for {country[0]} is {result[0]}."
            else:
                return f"No data found for {country[0]}."

    elif "silver" in text:
        # Silver medal distribution, largest count first.
        result = df[['Country', 'Silver']].sort_values(by='Silver', ascending=False)
        return result.to_string(index=False)

    # Add more query processing as needed

    return "Sorry, I didn't understand your query."
def main():
    """Command-line entry point: answer one query given as an argument.

    Parses a single positional ``query`` argument, passes it to
    ``process_query`` and prints the reply.
    """
    # Imported locally because the file never imports argparse at top level;
    # without this the function fails with NameError.
    import argparse

    parser = argparse.ArgumentParser(description="Olympic Data Analysis CLI")
    parser.add_argument('query', type=str, help="Your query about the Olympic dataset.")
    # NOTE(review): parse_args() reads sys.argv; inside a notebook kernel that
    # holds the kernel's own arguments — confirm this is only run as a script.
    args = parser.parse_args()

    response = process_query(args.query)
    print(response)

# Run the query CLI only when executed directly, not when imported.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'spacy'

ModuleNotFoundError: No module named 'spacy'

In [65]:
!pip list


Package                 Version
----------------------- --------
anaconda-anon-usage     0.4.4
annotated-types         0.7.0
archspec                0.2.3
blis                    0.7.11
boltons                 23.0.0
Brotli                  1.0.9
catalogue               2.0.10
certifi                 2024.7.4
cffi                    1.16.0
charset-normalizer      3.3.2
click                   8.1.7
cloudpathlib            0.19.0
colorama                0.4.6
conda                   24.7.1
conda-content-trust     0.2.0
conda-libmamba-solver   24.7.0
conda-package-handling  2.3.0
conda_package_streaming 0.10.0
confection              0.1.5
cryptography            42.0.5
cymem                   2.0.8
distro                  1.9.0
en-core-web-sm          3.7.1
et-xmlfile              1.1.0
frozendict              2.4.2
idna                    3.7
Jinja2                  3.1.4
joblib                  1.4.2
jsonpatch               1.33
jsonpointer             2.1
langcodes               3.4.