# **Installing OpenAI**

In [None]:
# Pin the pre-1.0 SDK: the cells below call openai.ChatCompletion, an
# interface that the 1.x releases of the package no longer provide.
pip install openai==0.28



# **Qualitative Approach**

In [None]:
import openai
import os

# Read the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding a secret in the notebook, where it would be saved and shared
# together with the file. The value is None when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

def generate_checklist(prompt):
    """Send ``prompt`` to gpt-3.5-turbo and return the reply text.

    Args:
        prompt: Text sent as the user message of the chat request.

    Returns:
        The content of the first returned choice, with leading and
        trailing whitespace stripped.
    """
    conversation = [
        {"role": "system", "content": "You are an expert assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=500,
        temperature=0.5,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"].strip()

# User message for the first, loosely specified checklist request.
prompt = """
Create a detailed checklist to avoid data shift mistakes in machine learning projects.
The checklist should include steps for data collection, preprocessing, model training, validation, and deployment.
"""

# Send the prompt to the model and print the reply.
checklist = generate_checklist(prompt)
print(checklist)


Here is a detailed checklist to avoid data shift mistakes in machine learning projects:

1. **Data Collection:**
   - Define the data requirements and objectives of the project.
   - Identify relevant data sources and ensure data quality.
   - Collect a representative sample of the data to ensure diversity.
   - Keep track of metadata, such as timestamps and data sources.

2. **Data Preprocessing:**
   - Clean the data by handling missing values, outliers, and duplicates.
   - Normalize or standardize the data to ensure consistency.
   - Perform feature engineering to extract relevant information.
   - Split the data into training, validation, and test sets.

3. **Model Training:**
   - Choose appropriate algorithms based on the problem and data.
   - Train the model on the training set using cross-validation techniques.
   - Monitor and log hyperparameters, model performance, and training metrics.
   - Regularly update and retrain the model as new data becomes available.

4. **Validat

In [None]:
def generate_checklist(workflow):
    """Send ``workflow`` to gpt-3.5-turbo and return the reply text.

    Args:
        workflow: Text sent as the user message of the chat request.

    Returns:
        The content of the first returned choice, unmodified.
    """
    conversation = [
        {"role": "system", "content": "You are an expert assistant."},
        {"role": "user", "content": workflow},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=500,
        temperature=0.5,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

# Structured prompt: names the five sections the checklist must contain and
# the points each section has to address.
workflow = """
You are an expert assistant tasked with creating a detailed checklist to avoid data shift mistakes in machine learning projects.
The checklist should be comprehensive and divided into the following sections: data collection, preprocessing, model training, validation, and deployment.
For each section, provide specific steps, recommendations, and best practices. Here is the structure:

1. Data Collection:
   - Outline steps to ensure data consistency.
   - Detail how to monitor changes in data sources.
   - Describe the importance of version control for datasets.

2. Data Preprocessing:
   - Explain the preprocessing steps that should be consistently applied.
   - Discuss the importance of scaling and normalizing data.
   - Highlight methods to check for missing or anomalous values.

3. Model Training:
   - Describe the use of cross-validation to assess model performance.
   - Provide guidelines on training models with representative data samples.
   - Emphasize the need for regular retraining with updated data.

4. Validation:
   - Explain how to use a holdout validation set.
   - Provide steps to continuously monitor model performance metrics.
   - Discuss the implementation of statistical tests to detect data drift.

5. Deployment:
   - Detail the deployment of monitoring tools to track data and model performance.
   - Explain how to set up alerts for significant deviations in data patterns.
   - Recommend regular updates to models based on new data insights.

Ensure that each section is clear and actionable.
"""

# Generate the checklist from the structured prompt and print it.
checklist = generate_checklist(workflow)
print(checklist)


### Checklist to Avoid Data Shift Mistakes in Machine Learning Projects

#### 1. Data Collection:
- **Ensure Data Consistency:**
  - Define clear data collection protocols and standards.
  - Regularly check for duplicates, inconsistencies, and missing values.
- **Monitor Changes in Data Sources:**
  - Set up automated monitoring systems for data sources.
  - Track metadata changes and updates to understand the impact on the data.
- **Importance of Version Control:**
  - Use version control systems (e.g., Git) to track changes in datasets.
  - Document data transformations and maintain a clear record of dataset versions.

#### 2. Data Preprocessing:
- **Consistent Preprocessing Steps:**
  - Standardize data cleaning processes across all datasets.
  - Document preprocessing steps to ensure reproducibility.
- **Scaling and Normalizing Data:**
  - Scale numerical features to ensure uniformity in model training.
  - Normalize data to bring different features to a similar scale.
- **Check fo

In [None]:
def generate_data_shift_insights(prompt):
    """Ask gpt-3.5-turbo, framed as a data shift specialist, about ``prompt``.

    Args:
        prompt: Text sent as the user message of the chat request.

    Returns:
        The content of the first returned choice, unmodified.
    """
    conversation = [
        {"role": "system", "content": "You are a data shift detection specialist."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=1000,
        temperature=0.5,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']


In [None]:
# Prompt for the specialist persona: same five areas as the earlier
# checklist prompt, with an added request for shift-detection methods,
# examples and tools in each area.
workflow = """
You are a data shift detection specialist. Your task is to create a detailed checklist and provide insights to detect data shift in machine learning projects. The checklist should cover the following areas: data collection, data preprocessing, model training, model validation, and model deployment. For each area, provide specific steps, recommendations, best practices, and methods to detect data shift. Include examples and tools where applicable.

1. Data Collection:
   - Steps to ensure data consistency.
   - Methods to monitor changes in data sources.
   - Importance and implementation of version control for datasets.

2. Data Preprocessing:
   - Preprocessing steps that should be consistently applied.
   - Techniques for scaling and normalizing data.
   - Methods to check for missing or anomalous values.
   - Tools and techniques for detecting changes in data distribution.

3. Model Training:
   - Use of cross-validation to assess model performance.
   - Guidelines on training models with representative data samples.
   - Regular retraining strategies with updated data.
   - Techniques for detecting data shift during training.

4. Model Validation:
   - Use of a holdout validation set and its importance.
   - Continuous monitoring of model performance metrics.
   - Statistical tests to detect data drift.
   - Tools and techniques for validation and monitoring.

5. Model Deployment:
   - Deployment of monitoring tools to track data and model performance.
   - Setting up alerts for significant deviations in data patterns.
   - Regular updates to models based on new data insights.
   - Strategies for handling detected data shifts in production.

Provide detailed steps and explanations for each point, along with examples and recommended tools.
"""


In [None]:
# Send the specialist prompt to the model and print the reply.
insights = generate_data_shift_insights(workflow)
print(insights)


1. Data Collection:
   - Steps to ensure data consistency:
     - Define clear data collection processes and protocols.
     - Regularly check for missing or incomplete data.
     - Implement data quality checks to identify inconsistencies.
   - Methods to monitor changes in data sources:
     - Set up automated monitoring systems to track data sources.
     - Compare new data with historical data to detect deviations.
     - Use data profiling tools to analyze data distributions.
   - Importance and implementation of version control for datasets:
     - Use version control systems (e.g., Git) to track changes in datasets.
     - Document dataset versions and changes made during preprocessing.
     - Ensure reproducibility by maintaining a record of dataset versions.

2. Data Preprocessing:
   - Preprocessing steps that should be consistently applied:
     - Cleaning data by handling missing values and outliers.
     - Encoding categorical variables and standardizing data formats.
    

# **Quantitative Approach**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import ks_2samp

import os

def read_csv(file_path):
    """Load a CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``pandas.DataFrame`` with the file's contents.
    """
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file rather than chunk by chunk, so a column cannot end up holding a
    # mix of types (which pandas warns about and which would skew the
    # numeric/non-numeric decision made per column later on).
    return pd.read_csv(file_path, low_memory=False)

def analyze_data_shift(df):
    """Compare a random 90/10 split of ``df`` column by column.

    The frame is split with ``train_test_split(test_size=0.1,
    random_state=42)`` and every column is compared across the two parts:

    * numeric columns use the two-sample Kolmogorov-Smirnov test and are
      flagged when the p-value is below 0.05;
    * all other columns use a chi-square-like distance between the
      normalised value frequencies and are flagged when it exceeds 0.05.

    Args:
        df: DataFrame to analyse.

    Returns:
        A dict mapping each column name to a dict with ``"statistic"`` and
        ``"shift_detected"``; numeric columns additionally carry
        ``"p_value"``. A numeric column with no non-missing values in either
        part gets NaN for the statistic and p-value and is not flagged.
    """
    train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

    shift_report = {}

    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            # Missing values carry no distributional information and would
            # otherwise turn the KS result into NaN (or distort it), so they
            # are removed before testing.
            train_values = train_df[column].dropna().to_numpy()
            test_values = test_df[column].dropna().to_numpy()

            if train_values.size == 0 or test_values.size == 0:
                # ks_2samp rejects empty samples; report "not computable".
                stat, p_value = np.nan, np.nan
            else:
                stat, p_value = ks_2samp(train_values, test_values)

            shift_report[column] = {
                "statistic": stat,
                "p_value": p_value,
                "shift_detected": p_value < 0.05
            }
        else:
            train_dist = train_df[column].value_counts(normalize=True)
            test_dist = test_df[column].value_counts(normalize=True)

            # Align both frequency tables on every category seen in either
            # part; a category absent from one part counts as frequency 0.
            all_categories = train_dist.index.union(test_dist.index)
            train_dist = train_dist.reindex(all_categories, fill_value=0)
            test_dist = test_dist.reindex(all_categories, fill_value=0)

            # The 1e-6 term keeps the denominator non-zero.
            stat = np.sum((train_dist - test_dist)**2 / (train_dist + test_dist + 1e-6))
            shift_report[column] = {
                "statistic": stat,
                "shift_detected": stat > 0.05
            }

    return shift_report

def generate_shift_insights(shift_report):
    """Render a per-column shift report as plain text.

    Args:
        shift_report: Mapping of column name to a dict holding
            ``"shift_detected"`` and ``"statistic"`` and, optionally,
            ``"p_value"``.

    Returns:
        A string that starts with a title line and contains one paragraph
        per column: a verdict sentence, the statistic, and the p-value when
        the entry has one.
    """
    chunks = ["Data Shift Analysis Report:\n\n"]

    for column, report in shift_report.items():
        if report["shift_detected"]:
            verdict = "shows a potential data shift"
        else:
            verdict = "does not show significant data shift"

        chunks.append(f"Column '{column}' {verdict}.\n")
        chunks.append(f"  - Statistic: {report['statistic']}\n")
        if "p_value" in report:
            chunks.append(f"  - P-value: {report['p_value']}\n")
        chunks.append("\n")

    return "".join(chunks)

def get_llm_insights(prompt):
    """Ask gpt-3.5-turbo, framed as a data shift specialist, about ``prompt``.

    Args:
        prompt: Text sent as the user message of the chat request.

    Returns:
        The content of the first returned choice, unmodified.
    """
    conversation = [
        {"role": "system", "content": "You are a data shift detection specialist."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=1000,
        temperature=0.5,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']


def analyze_csv_for_data_shift(file_path):
    """Run the whole data-shift pipeline on one CSV file.

    Loads the file, computes the per-column shift report, renders it as
    text, asks the LLM to comment on that text, and combines both.

    Args:
        file_path: Path of the CSV file to analyse.

    Returns:
        A string holding the rendered shift report followed by the LLM's
        insights and recommendations.
    """
    report_text = generate_shift_insights(analyze_data_shift(read_csv(file_path)))

    question = (
        "Here is the data shift analysis report:\n\n"
        f"{report_text}"
        "\nPlease provide additional insights and recommendations based on this report."
    )
    commentary = get_llm_insights(question)

    return f"{report_text}\nLLM Insights and Recommendations:\n\n{commentary}"


from google.colab import files


# Ask for a file upload through Colab; only the first uploaded file name is
# used as the CSV to analyse.
uploaded = files.upload()
file_path = list(uploaded.keys())[0]


# Run the statistical report plus the LLM commentary and print the result.
insights = analyze_csv_for_data_shift(file_path)
print(insights)


Saving SFPD_stop_cleaned_data.csv to SFPD_stop_cleaned_data.csv


  return pd.read_csv(file_path)


Data Shift Analysis Report:

Column 'doj_record_id' shows a potential data shift.
  - Statistic: 1.6714001356176482

Column 'person_number' does not show significant data shift.
  - Statistic: 0.0024236000901841637
  - P-value: 0.8553208407192856

Column 'agency_ori' does not show significant data shift.
  - Statistic: 0.0

Column 'stop_datetime' shows a potential data shift.
  - Statistic: 1.5040839368408818

Column 'duration_of_stop' does not show significant data shift.
  - Statistic: 0.0025625565273678097
  - P-value: 0.8051680767666806

Column 'is_stop_response_to_call' does not show significant data shift.
  - Statistic: 0.0008416639895324485
  - P-value: 0.9999999999886952

Column 'location' shows a potential data shift.
  - Statistic: 0.561321733344526

Column 'district' does not show significant data shift.
  - Statistic: 9.190702430430465e-05

Column 'city' does not show significant data shift.
  - Statistic: 0.0013663455115759111

Column 'perceived_race_ethnicity' does not s

# **Improving Functions**

In [None]:
def extract_year_from_datetime(df):
    """Add a ``year`` column parsed from ``stop_datetime`` and return ``df``.

    The column is written onto the frame that was passed in, so the
    caller's DataFrame is modified.
    """
    df['year'] = pd.to_datetime(df['stop_datetime']).dt.year
    return df


def analyze_data_shift(df, split_year=2020):
    """Compare rows before ``split_year`` with rows from it onwards.

    A ``year`` column is derived from ``stop_datetime`` (and added to ``df``
    in place). Rows with ``year < split_year`` form the reference sample and
    rows with ``year >= split_year`` the comparison sample. Every column,
    including the derived ``year``, is then compared across the two samples:

    * numeric columns use the two-sample Kolmogorov-Smirnov test and are
      flagged when the p-value is below 0.05;
    * all other columns use a chi-square-like distance between the
      normalised value frequencies and are flagged when it exceeds 0.05.

    Args:
        df: DataFrame containing a ``stop_datetime`` column.
        split_year: First year that belongs to the comparison sample.

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        ``Column``, ``Statistic``, ``P-value/Shift Detected`` and
        ``Shift Detected``. ``P-value/Shift Detected`` keeps its historical
        name but holds the p-value only: it is NaN for non-numeric columns
        and for numeric columns where one sample has no non-missing values.
        ``Shift Detected`` always holds the boolean flag.
    """
    df = extract_year_from_datetime(df)

    train_df = df[df['year'] < split_year]
    test_df = df[df['year'] >= split_year]

    shift_report = []

    for column in df.columns:
        if pd.api.types.is_numeric_dtype(df[column]):
            # Missing values would turn the KS result into NaN (or distort
            # it), so they are removed before testing.
            train_values = train_df[column].dropna().to_numpy()
            test_values = test_df[column].dropna().to_numpy()

            if train_values.size == 0 or test_values.size == 0:
                # ks_2samp rejects empty samples; report "not computable".
                stat, p_value = np.nan, np.nan
            else:
                stat, p_value = ks_2samp(train_values, test_values)

            shift_report.append([column, stat, p_value, p_value < 0.05])
        else:
            train_dist = train_df[column].value_counts(normalize=True)
            test_dist = test_df[column].value_counts(normalize=True)

            # Align both frequency tables on every category seen in either
            # sample; a category absent from one counts as frequency 0.
            all_categories = train_dist.index.union(test_dist.index)
            train_dist = train_dist.reindex(all_categories, fill_value=0)
            test_dist = test_dist.reindex(all_categories, fill_value=0)

            # The 1e-6 term keeps the denominator non-zero.
            stat = np.sum((train_dist - test_dist)**2 / (train_dist + test_dist + 1e-6))

            # Every row needs all four fields; without the NaN placeholder
            # the flag would land in the p-value column and the
            # 'Shift Detected' column would be left empty.
            shift_report.append([column, stat, np.nan, stat > 0.05])

    shift_report_df = pd.DataFrame(shift_report, columns=['Column', 'Statistic', 'P-value/Shift Detected', 'Shift Detected'])

    return shift_report_df


# Re-run the analysis with the year-based split on the CSV at this fixed path.
file_path = "/content/SFPD_stop_cleaned_data.csv"
df = read_csv(file_path)
shift_report = analyze_data_shift(df)
print(shift_report)


  return pd.read_csv(file_path)


                                Column  Statistic P-value/Shift Detected  \
0                        doj_record_id   1.792558                   True   
1                        person_number   0.002184               0.945202   
2                           agency_ori   0.000000                  False   
3                        stop_datetime   1.812836                   True   
4                     duration_of_stop   0.109771                    0.0   
5             is_stop_response_to_call   0.082368                    0.0   
6                             location   1.323937                   True   
7                             district   0.025604                  False   
8                                 city   0.017861                  False   
9             perceived_race_ethnicity   0.003299                  False   
10                    perceived_gender   0.000987                  False   
11                             is_lgbt   0.000118                    1.0   
12          

In [None]:
# Show the current value of shift_report as the cell result.
shift_report

{'doj_record_id': {'statistic': 1.7925581299440647, 'shift_detected': True},
 'person_number': {'statistic': 0.0021844359575853654,
  'p_value': 0.9452016604771285,
  'shift_detected': False},
 'agency_ori': {'statistic': 0.0, 'shift_detected': False},
 'stop_datetime': {'statistic': 1.8128361351349973, 'shift_detected': True},
 'duration_of_stop': {'statistic': 0.10977126857837027,
  'p_value': 0.0,
  'shift_detected': True},
 'is_stop_response_to_call': {'statistic': 0.0823678529583658,
  'p_value': 0.0,
  'shift_detected': True},
 'location': {'statistic': 1.3239370649277609, 'shift_detected': True},
 'district': {'statistic': 0.02560410772798309, 'shift_detected': False},
 'city': {'statistic': 0.01786080768295122, 'shift_detected': False},
 'perceived_race_ethnicity': {'statistic': 0.0032986942868011688,
  'shift_detected': False},
 'perceived_gender': {'statistic': 0.0009873894904830903,
  'shift_detected': False},
 'is_lgbt': {'statistic': 0.00011782226263068463,
  'p_value': 1.