<a href="https://colab.research.google.com/github/a2hvin/carti/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# Download dataset id 89 through ucimlrepo.
solar_flare = fetch_ucirepo(id=89)

# Feature and target tables (pandas DataFrames).
X, y = solar_flare.data.features, solar_flare.data.targets

# Show the dataset metadata, then the per-variable information.
print(solar_flare.metadata, solar_flare.variables, sep='\n')


In [None]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Load dataset id 89 and pick out its feature / target frames.
solar_flare = fetch_ucirepo(id=89)
X, y = solar_flare.data.features, solar_flare.data.targets

# Lowercase every column header (assigned in place on the fetched frames).
X.columns = list(map(str.lower, X.columns))
y.columns = list(map(str.lower, y.columns))

# Treat literal '?' entries as missing values.
X, y = (frame.replace('?', pd.NA) for frame in (X, y))

# Join features and targets side by side, then keep only fully populated rows.
df = pd.concat([X, y], axis=1).dropna()

# Preview the cleaned frame: lowercase columns, no missing values.
print(df.head())


In [None]:
print(df.columns)

# Build the analysis frame in one chain:
#   1. coerce both flare-count columns to numbers (unparseable entries -> NaN),
#   2. drop rows where either count is missing,
#   3. append their sum as 'total_flares'.
# NOTE(review): the "total" only adds the moderate and severe counts. If the
# targets also contain a lower-severity flare column (not visible in this
# cell), confirm that leaving it out is intended.
df = (
    df.assign(**{
        'moderate flares': pd.to_numeric(df['moderate flares'], errors='coerce'),
        'severe flares': pd.to_numeric(df['severe flares'], errors='coerce'),
    })
    .dropna(subset=['moderate flares', 'severe flares'])
    .assign(total_flares=lambda frame: frame['moderate flares'] + frame['severe flares'])
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo


def _show_evolution_plot(plot_func, data, palette, figsize, title, ylabel, **plot_kwargs):
    """Draw one seaborn categorical plot with 'evolution' on the x-axis and show it.

    Args:
        plot_func: seaborn plotting function to call (e.g. ``sns.countplot``).
        data: DataFrame containing an 'evolution' column.
        palette: seaborn palette name used to colour the categories.
        figsize: ``(width, height)`` of the new figure.
        title: figure title.
        ylabel: y-axis label.
        **plot_kwargs: forwarded to ``plot_func`` (e.g. ``y='severe flares'``).
    """
    plt.figure(figsize=figsize)
    # Passing `palette` without `hue` is deprecated in seaborn 0.13, so colour
    # by the x variable itself. dodge=False keeps one full-width bar/box per
    # category instead of splitting each slot by hue level.
    ax = plot_func(
        data=data,
        x='evolution',
        hue='evolution',
        palette=palette,
        dodge=False,
        **plot_kwargs,
    )
    # The hue legend would only repeat the x tick labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    plt.title(title)
    plt.xlabel('Evolution Type')
    plt.ylabel(ylabel)
    plt.show()


# Fetch dataset id 89 and combine its features and targets into one frame.
solar_flare = fetch_ucirepo(id=89)
X = solar_flare.data.features
y = solar_flare.data.targets

df = pd.concat([X, y], axis=1)
df.columns = [col.lower() for col in df.columns]

# Literal '?' entries are treated as missing values.
df = df.replace('?', pd.NA)

# Flare counts must be numeric; anything unparseable becomes NaN.
df['moderate flares'] = pd.to_numeric(df['moderate flares'], errors='coerce')
df['severe flares'] = pd.to_numeric(df['severe flares'], errors='coerce')

# Keep only rows usable for the analysis. `.copy()` makes the result an
# independent frame so the column assignments below never target a view.
df = df.dropna(subset=['evolution', 'moderate flares', 'severe flares']).copy()

df['total_flares'] = df['moderate flares'] + df['severe flares']

# Translate the numeric evolution codes into readable labels.
growth_map = {
    1: 'Decay',
    2: 'No Growth',
    3: 'Growth'
}
df['evolution'] = df['evolution'].astype(int).map(growth_map)

sns.set()

# Plot 1: number of rows per evolution type.
_show_evolution_plot(
    sns.countplot, df, 'pastel', (6, 4),
    'Sunspot Counts based on Evolution Type', 'Count',
)

# Plot 2: distribution of severe flare counts per evolution type.
_show_evolution_plot(
    sns.boxplot, df, 'muted', (8, 5),
    'Severe (X-Class) Flare Distribution by Evolution Type', 'Severe Flares',
    y='severe flares',
)

# Plot 3: distribution of total flare counts per evolution type.
_show_evolution_plot(
    sns.boxplot, df, 'muted', (8, 5),
    'Total Flare Distribution by Evolution Type', 'Total Flares',
    y='total_flares',
)


In [None]:
# State the two hypotheses, one line each.
print(
    "Null Hypothesis:",
    "The mean total flare activity is the same across all sunspot evolution types.",
    "Alternative Hypothesis:",
    "At least one sunspot evolution type has a different mean total flare activity.",
    sep="\n",
)




In [None]:
import pandas as pd
import scipy.stats as stats
from ucimlrepo import fetch_ucirepo

# Get the data
solar_flare = fetch_ucirepo(id=89)
X = solar_flare.data.features
y = solar_flare.data.targets

# Combine features and targets, then apply the same normalisation as the
# plotting cell (lowercase headers, '?' treated as missing) so both cells
# produce the same `df`.
df = pd.concat([X, y], axis=1)
df.columns = [col.lower() for col in df.columns]
df = df.replace('?', pd.NA)

# Make flare columns numbers (in case they're strings)
df['moderate flares'] = pd.to_numeric(df['moderate flares'], errors='coerce')
df['severe flares'] = pd.to_numeric(df['severe flares'], errors='coerce')

# Remove rows with missing data in important columns. `.copy()` makes the
# result an independent frame for the column assignments below.
df = df.dropna(subset=['evolution', 'moderate flares', 'severe flares']).copy()

# Add total flares column
# NOTE(review): the "total" only adds the moderate and severe counts. If the
# targets also contain a lower-severity flare column (not visible in this
# cell), confirm that leaving it out is intended.
df['total_flares'] = df['moderate flares'] + df['severe flares']

# Label the evolution codes exactly as the plotting cell does. Later cells look
# groups up by label ('Growth', 'No Growth', 'Decay'); leaving numeric codes in
# `df` here makes those lookups fail on a top-to-bottom run.
growth_map = {
    1: 'Decay',
    2: 'No Growth',
    3: 'Growth'
}
df['evolution'] = df['evolution'].astype(int).map(growth_map)

# Count how many data points per evolution type
counts = df.groupby('evolution')['total_flares'].count()
print("Data points per evolution type:")
print(counts)

# Make groups only if they have at least 2 data points.
# groupby yields the groups in sorted key order and skips missing keys.
groups = [
    group_data
    for _, group_data in df.groupby('evolution')['total_flares']
    if len(group_data) > 1
]

# Significance level for the test decision.
alpha = 0.05

if len(groups) >= 2:
    # One-way ANOVA across the evolution groups.
    f_stat, p_val = stats.f_oneway(*groups)
    print(f"\nANOVA F-statistic: {f_stat:.3f}")
    print(f"p-value: {p_val:.3f}")

    if p_val < alpha:
        print("Reject null hypothesis: There is a difference between groups.")
    else:
        print("Fail to reject null hypothesis: No difference found between groups.")
else:
    print("Not enough groups with enough data to run ANOVA.")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib.patches import Patch

# Bar colour per evolution label; the dict order is also the bar order.
colors = {
    'Growth': '#66c2a5',
    'No Growth': '#fc8d62',
    'Decay': '#8da0cb'
}
labels = list(colors.keys())

# `df['evolution']` may hold either the numeric codes (1/2/3) or the readable
# labels, depending on which earlier cell built `df` last. Normalise to labels
# on a copy so this cell works in both cases and leaves `df` untouched.
evolution_names = {1: 'Decay', 2: 'No Growth', 3: 'Growth'}
labelled = df.assign(
    evolution=df['evolution'].map(lambda code: evolution_names.get(code, code))
)

# Mean and standard error of total flares per evolution type. `reindex` puts
# them in bar order and yields NaN (an empty bar) for any label that is absent
# from the data instead of raising KeyError.
grouped = labelled.groupby('evolution')['total_flares']
means = grouped.mean().reindex(labels)
errors = grouped.sem().reindex(labels)

mean_vals = means.tolist()
error_vals = errors.tolist()
bar_colors = [colors[label] for label in labels]

plt.figure(figsize=(8, 5))
bars = plt.bar(labels, mean_vals, yerr=error_vals, color=bar_colors, capsize=5)

# Legend built from colour patches, one per evolution label.
legend_elements = [Patch(facecolor=colors[label], label=label) for label in colors]
plt.legend(handles=legend_elements, title='Evolution Type')

plt.title('Average Total Flares by Evolution Type')
plt.ylabel('Mean Total Flares')
plt.xlabel('Evolution Type')
plt.tight_layout()
plt.show()
