In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the dataset
data = pd.read_csv('dataset.csv')

# Columns used as model inputs and the column to predict
features = ['Data_Value_Type', 'Data_Value_Alt', 'Break_out', 'GeoLocation']
target = 'CategoryID'  # Assuming CategoryID represents the cardiovascular disease presence

# Encode each categorical feature as integer codes.
# One encoder per column is kept so every fitted mapping stays available
# afterwards (e.g. for inverse_transform); a single shared encoder would be
# refit on each column and only remember the last one.
categorical_features = ['Data_Value_Type', 'Break_out', 'GeoLocation']
encoders = {}
for column in categorical_features:
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# Split the data into training and testing sets (80 % / 20 %, seeded)
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier.
# The seed makes the fitted forest, and therefore the report below,
# reproducible from run to run, matching the seeded split above.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred))


  data = pd.read_csv('dataset.csv')


              precision    recall  f1-score   support

          C1       0.82      0.78      0.80      5615
          C2       0.90      0.92      0.91     11545

    accuracy                           0.87     17160
   macro avg       0.86      0.85      0.85     17160
weighted avg       0.87      0.87      0.87     17160



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Read the raw CSV into a DataFrame
data = pd.read_csv('dataset.csv')
column_name = 'Confidence_Limit_Low'
output_file = 'modified_column.txt'

# Parse the column as numbers: entries that cannot be parsed become NaN
# (errors='coerce'), and those NaNs are then replaced with 0.
numeric_values = pd.to_numeric(data[column_name], errors='coerce')
data[column_name] = numeric_values.fillna(0)

# Show the cleaned column, then write it (without the index) to a text file
print(data[column_name])
data[column_name].to_csv(output_file, index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the dataset
data = pd.read_csv('dataset.csv')

# Coerce both confidence-limit columns to numbers; entries that cannot be
# parsed become NaN (errors='coerce') and are then replaced with 0.
ci_columns = ['Confidence_Limit_Low', 'Confidence_Limit_High']
for column in ci_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce').fillna(0)

# Persist the cleaned columns so they can be inspected outside the notebook
output_file = 'modified_column.txt'
data['Confidence_Limit_Low'].to_csv(output_file, index=False)
output_file2 = 'modified_column2.txt'
data['Confidence_Limit_High'].to_csv(output_file2, index=False)

# Select relevant columns for prediction
features = ['Data_Value_Alt', 'Confidence_Limit_Low', 'Confidence_Limit_High']
target_range = [9, 11]  # Inclusive [low, high] range for Data_Value_Alt
range_low, range_high = target_range

# Separate the dataset into two subsets based on Data_Value_Alt.
# "Unstable" means strictly outside the range. The previous condition
# (`<= low AND >= high`) can never be true when low < high, so the subset was
# always empty and the later train/test split raised. Rows whose
# Data_Value_Alt is NaN satisfy neither mask and are left out of both subsets.
values = data['Data_Value_Alt']
within_mask = (values >= range_low) & (values <= range_high)
unstable_mask = (values < range_low) | (values > range_high)
data_within_range = data[within_mask]
data_unstable = data[unstable_mask]

# --- Regression: predict Data_Value_Alt inside the range from the limits ---
X_range = data_within_range[ci_columns]
y_range = data_within_range['Data_Value_Alt']
X_train_range, X_test_range, y_train_range, y_test_range = train_test_split(
    X_range, y_range, test_size=0.2, random_state=42
)

regressor = LinearRegression()
regressor.fit(X_train_range, y_train_range)
y_pred_range = regressor.predict(X_test_range)

# --- Classification: predict whether a row is unstable from the limits ---
# RandomForestClassifier needs discrete class labels, so the label is the
# binary flag 1 = outside the range, 0 = within the range, taken over every
# row that falls in one of the two subsets. The previous code passed the raw
# Data_Value_Alt values of the unstable subset as labels.
# NOTE(review): confirm that in-range vs. out-of-range is the intended
# classification task.
labelled_mask = within_mask | unstable_mask
X_labelled = data.loc[labelled_mask, ci_columns]
y_labelled = unstable_mask[labelled_mask].astype(int)
X_train_unstable, X_test_unstable, y_train_unstable, y_test_unstable = train_test_split(
    X_labelled, y_labelled, test_size=0.2, random_state=42
)

# Seeded so the fitted forest and the report below are reproducible
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train_unstable, y_train_unstable)
y_pred_unstable = classifier.predict(X_test_unstable)

# Print the number of patients within the range
print("Patients within the range:", len(data_within_range))

# Print the number of unstable estimates
print("Unstable estimates:", len(data_unstable))

# Classification report for unstable predictions
print(classification_report(y_test_unstable, y_pred_unstable))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the dataset
data = pd.read_csv('dataset.csv')

# Coerce both confidence-limit columns to numbers; entries that cannot be
# parsed become NaN (errors='coerce') and are then replaced with 0.
ci_columns = ['Confidence_Limit_Low', 'Confidence_Limit_High']
for column in ci_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce').fillna(0)

# Select relevant columns for prediction
features = ['Data_Value_Alt', 'Confidence_Limit_Low', 'Confidence_Limit_High']
target_range = [7.5, 8.5]  # Inclusive [low, high] range for Data_Value_Alt

# Preprocess the data.
# An estimate counts as unstable when Data_Value_Alt is missing or not
# numeric, i.e. NaN after coercion. The previous code filled NaN with a '~'
# sentinel, coerced it straight back to NaN, and then compared the numeric
# column with '~'; that comparison never matches, so the unstable subset was
# always empty.
data['Data_Value_Alt'] = pd.to_numeric(data['Data_Value_Alt'], errors='coerce')
unstable_mask = data['Data_Value_Alt'].isna()

# Separate the dataset into two subsets based on the target value
within_mask = (data['Data_Value_Alt'] >= target_range[0]) & (data['Data_Value_Alt'] <= target_range[1])
data_within_range = data[within_mask]
data_unstable = data[unstable_mask]

# Print the number of patients within the range
print("Patients within the range:", len(data_within_range))

# Print the number of unstable estimates
print("Unstable estimates:", len(data_unstable))

# --- Regression: predict Data_Value_Alt inside the range from the limits ---
# Guarded with if/else rather than exit(): calling exit() inside a notebook
# shuts the kernel down instead of merely skipping the step.
if len(data_within_range) < 2:
    print("Not enough samples for range prediction.")
else:
    X_range = data_within_range[ci_columns]
    y_range = data_within_range['Data_Value_Alt']
    X_train_range, X_test_range, y_train_range, y_test_range = train_test_split(
        X_range, y_range, test_size=0.2, random_state=42
    )

    regressor = LinearRegression()
    regressor.fit(X_train_range, y_train_range)
    y_pred_range = regressor.predict(X_test_range)

# --- Classification: predict whether an estimate is unstable ---
if len(data_unstable) < 2:
    print("Not enough samples for unstable prediction.")
else:
    # The label is the binary flag 1 = unstable, 0 = stable, over all rows.
    # Using Data_Value_Alt of the unstable subset as the label, as before,
    # would hand the classifier nothing but NaN.
    # NOTE(review): confirm that unstable vs. stable is the intended task.
    X_all = data[ci_columns]
    y_is_unstable = unstable_mask.astype(int)
    X_train_unstable, X_test_unstable, y_train_unstable, y_test_unstable = train_test_split(
        X_all, y_is_unstable, test_size=0.2, random_state=42
    )

    # Seeded so the fitted forest and the report below are reproducible
    classifier = RandomForestClassifier(random_state=42)
    classifier.fit(X_train_unstable, y_train_unstable)
    y_pred_unstable = classifier.predict(X_test_unstable)

    # Classification report for unstable predictions
    print(classification_report(y_test_unstable, y_pred_unstable))


In [12]:
import pandas as pd
import numpy as np

# Read the source data
data = pd.read_csv('dataset.csv')

# Parse both confidence-limit columns as numbers; entries that cannot be
# parsed become NaN (errors='coerce') and are then replaced with 0.
for limit_column in ('Confidence_Limit_Low', 'Confidence_Limit_High'):
    data[limit_column] = pd.to_numeric(data[limit_column], errors='coerce').fillna(0)

# Fixed inclusive range [low, high] that Data_Value_Alt is checked against
target_range = [9, 11]
lower_bound, upper_bound = target_range

# Keep only the rows whose Data_Value_Alt lies strictly outside that range
outside_range = data['Data_Value_Alt'].lt(lower_bound) | data['Data_Value_Alt'].gt(upper_bound)
filtered_data = data[outside_range]

# Collect the CategoryID of every kept row into a one-column DataFrame
category_ids = filtered_data['CategoryID']
result_df = category_ids.to_frame(name='CategoryID')

# Write the result to a CSV file, omitting the index
result_df.to_csv('ans.csv', index=False)


In [17]:
import pandas as pd
import numpy as np

# Read the source data and report how many rows it holds
data = pd.read_csv('dataset.csv')
print(f"orginal data length {len(data)}")

# Parse both confidence-limit columns as numbers; entries that cannot be
# parsed become NaN (errors='coerce') and are then replaced with 0.
for limit_column in ('Confidence_Limit_Low', 'Confidence_Limit_High'):
    data[limit_column] = pd.to_numeric(data[limit_column], errors='coerce').fillna(0)

# Fixed inclusive range [low, high] that Data_Value_Alt is checked against
target_range = [9, 11]
lower_bound, upper_bound = target_range

# Keep only the rows whose Data_Value_Alt lies strictly outside that range
outside_range = data['Data_Value_Alt'].lt(lower_bound) | data['Data_Value_Alt'].gt(upper_bound)
filtered_data = data[outside_range]

# Columns written to the output file, in output order
output_columns = [
    'Year', 'LocationAbbr', 'LocationDesc', 'Datasource', 'Category', 'Topic',
    'Indicator', 'Data_Value_Type', 'Data_Value_Unit', 'Data_Value',
    'Data_Value_Alt', 'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
    'Confidence_Limit_Low', 'Confidence_Limit_High', 'Break_Out_Category',
    'Break_out', 'CategoryID', 'TopicID', 'IndicatorID', 'Data_Value_TypeID',
    'BreakoutCategoryID', 'BreakOutID', 'LocationID', 'GeoLocation',
]
result_df = filtered_data[output_columns]

# Write the kept rows to a CSV file (no index) and report how many there are
result_df.to_csv('ans.csv', index=False)
print(len(result_df))


orginal data length 85800
83097


In [30]:
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Imported here so the cell does not depend on an earlier cell having
# brought train_test_split into the kernel namespace.
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('dataset.csv')
print(f"Length of original data: {len(data)}")

# Coerce both confidence-limit columns to numbers; entries that cannot be
# parsed become NaN (errors='coerce') and are then replaced with 0.
data['Confidence_Limit_Low'] = pd.to_numeric(data['Confidence_Limit_Low'], errors='coerce').fillna(0)
data['Confidence_Limit_High'] = pd.to_numeric(data['Confidence_Limit_High'], errors='coerce').fillna(0)

# Features: the two confidence limits.
# Target: 1 when Data_Value_Alt lies inside [Confidence_Limit_Low,
# Confidence_Limit_High] for that row, otherwise 0.
features = data[['Confidence_Limit_Low', 'Confidence_Limit_High']]
target = ((data['Data_Value_Alt'] >= data['Confidence_Limit_Low']) & (data['Data_Value_Alt'] <= data['Confidence_Limit_High'])).astype(int)

# Split the data into training and testing sets (80 % / 20 %, seeded)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create and train the Random Forest Classifier.
# The seed makes the fitted forest, and therefore the pickled model and the
# accuracy below, reproducible from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model's accuracy.
# NOTE(review): a recorded run of this cell printed an accuracy of 1.0, which
# suggests the label is trivially recoverable from the two features
# (presumably through the rows whose limits were coerced to 0) - confirm
# before relying on this model.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Independently of the model, keep the rows whose Data_Value_Alt lies strictly
# outside the fixed range below. The predictions are not used for this filter.
target_range = [9, 11]
filtered_data = data[(data['Data_Value_Alt'] < target_range[0]) | (data['Data_Value_Alt'] > target_range[1])]

print(f"Filtered data: {len(filtered_data)}")
# Save the filtered dataset as a new CSV file
filtered_data.to_csv('filtered_dataset.csv', index=False)

# Persist the trained classifier `clf`.
# Pickle files can execute arbitrary code when loaded, so model.pkl must only
# ever be loaded from a trusted source.
with open('model.pkl', 'wb') as file:
    pickle.dump(clf, file)



Length of original data: 85800
Accuracy: 1.0
Filtered data: 83097
