In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Gene-expression table; the first CSV column is used as the row index.
dengue_df = pd.read_csv(filepath_or_buffer="dengue_data.csv", index_col=0)

# Sample annotations, loaded with the default integer index.
meta_df = pd.read_csv(filepath_or_buffer="dengue_metadata.csv")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Integer code per disease state; the colormap is applied to these codes.
disease_state_mapping = {
    'Convalescent': 0,
    'Dengue Hemorrhagic Fever': 1,
    'Dengue Fever': 2,
    'healthy control': 3
}

dengue_df = pd.read_csv("dengue_data.csv", index_col=0)
meta_df = pd.read_csv("dengue_metadata.csv")

# Samples as rows, genes as columns -- the orientation PCA expects.
Transposed_df = dengue_df.T

# Look up each sample's disease state by sample ID, in the row order of
# Transposed_df. Taking the states in metadata-file order instead would
# silently mis-colour points whenever the two files list samples differently.
sample_states = (
    meta_df.set_index('sample')['disease.state'].reindex(Transposed_df.index)
)
samples_without_state = sample_states[sample_states.isna()].index.tolist()
if samples_without_state:
    raise ValueError(
        f"No disease.state metadata for samples: {samples_without_state}"
    )
disease_state_colours = [disease_state_mapping[state] for state in sample_states]

# Fit the PCA and project the samples onto the components in one step.
pca = PCA(n_components=10)
df_scores = pca.fit_transform(Transposed_df)
explained_var = pca.explained_variance_ratio_
print("Variance explained by each PC:", explained_var)

plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    df_scores[:, 0],
    df_scores[:, 1],
    c=disease_state_colours,
    cmap='viridis',
    alpha=0.5,
)

# Build the legend by hand: one marker per disease state that was plotted,
# coloured with the same colormap/normalisation the scatter used.
legend_labels = [
    plt.Line2D(
        [0], [0],
        marker='o',
        color='w',
        markerfacecolor=scatter.to_rgba(disease_state_mapping[state]),
        markersize=10,
        label=state,
    )
    for state in sample_states.unique()
]

plt.legend(handles=legend_labels, title="Disease States")
plt.xlabel(f'PC1 ({explained_var[0]*100:.2f}% variance)')
plt.ylabel(f'PC2 ({explained_var[1]*100:.2f}% variance)')
plt.title('PCA of Gene Expression Data (PC1 vs PC2)')
plt.colorbar(scatter, label='Disease state')
plt.show()


In [None]:
import pandas as pd
import numpy as np  # Add this import statement
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt
from matplotlib.patches import Patch
from sklearn.preprocessing import StandardScaler

# Load the expression matrix and the sample metadata
data = pd.read_csv("dengue_data.csv", index_col=0)
meta = pd.read_csv("dengue_metadata.csv")

# Transpose the data so that samples are rows and genes are columns
data_transposed = data.T

# Attach the metadata to each sample's expression profile (matched on sample ID)
merged_data = pd.merge(meta, data_transposed, left_on='sample', right_index=True)

# Select only numeric data for HCA
# NOTE(review): any numeric metadata column would be clustered alongside the
# genes -- confirm the metadata file contains none.
numeric_data = merged_data.select_dtypes(include=[np.number])

# Standardize the data
scaler = StandardScaler()
numeric_data_scaled = scaler.fit_transform(numeric_data)

# Perform HCA with the 'ward' linkage method
linked = linkage(numeric_data_scaled, method='ward')

# Define a mapping from disease states to colors
color_mapping = {
    'Convalescent': 'blue',
    'Dengue Hemorrhagic Fever': 'red',
    'Dengue Fever': 'magenta',
    'healthy control': 'green'
}
mixed_color = 'grey'

# Observation i of the linkage is row i of merged_data (positional).
leaf_states = merged_data['disease.state'].to_numpy()
n_leaves = len(leaf_states)

# Work out, for every cluster node, whether all leaves beneath it share one
# disease state. Row r of the linkage matrix creates node n_leaves + r from two
# earlier nodes, so a single bottom-up pass is enough.
node_state = dict(enumerate(leaf_states))
for row, (left, right) in enumerate(linked[:, :2].astype(int)):
    left_state, right_state = node_state[left], node_state[right]
    node_state[n_leaves + row] = left_state if left_state == right_state else None


def link_colour(node_id):
    """Return the state colour for a pure cluster, or grey for a mixed one."""
    state = node_state[node_id]
    return mixed_color if state is None else color_mapping[state]


# Generate the dendrogram. Links are coloured per cluster node; the U-shaped
# links in dendro['icoord'] do not correspond one-to-one with dendro['leaves'],
# so they cannot be coloured by pairing the two lists.
fig, ax = plt.subplots(figsize=(12, 8))  # Adjust the size as needed
dendro = dendrogram(linked,
                    orientation='top',
                    distance_sort='descending',
                    show_leaf_counts=True,
                    no_labels=True,  # Labels are added manually below
                    link_color_func=link_colour,
                    ax=ax)

# Add sample names as labels at the tips of the dendrogram (leaves sit at
# x = 5, 15, 25, ...), each coloured by that sample's disease state.
leaf_order = dendro['leaves']
ax.set_xticks([5 + 10 * i for i in range(len(leaf_order))])
ax.set_xticklabels(merged_data['sample'].iloc[leaf_order].to_numpy(), rotation=90, ha='center', fontsize=8)  # Adjust fontsize as needed
for tick_label, leaf_idx in zip(ax.get_xticklabels(), leaf_order):
    tick_label.set_color(color_mapping[leaf_states[leaf_idx]])

# Create a legend for the colors with corresponding disease states
legend_handles = [Patch(facecolor=color, label=label) for label, color in color_mapping.items()]
legend_handles.append(Patch(facecolor=mixed_color, label='Mixed cluster'))
ax.legend(handles=legend_handles, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., title='Disease State')

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Uses `merged_data` (metadata + expression, one row per sample) built in the
# clustering cell above.

# Disease states to compare pairwise
disease_states = ["Dengue Fever", "Dengue Hemorrhagic Fever", "Convalescent", "healthy control"]  # Add other conditions as needed

# Significance level and |log2 fold change| cut-off
alpha = 0.05
fold_change_threshold = 0.58  # Adjust as needed

# Create volcano plots for each pairwise comparison
for i, state1 in enumerate(disease_states):
    for state2 in disease_states[i + 1:]:
        # Numeric columns of the samples belonging to each state
        state1_data = merged_data[merged_data["disease.state"] == state1].select_dtypes(include=[np.number])
        state2_data = merged_data[merged_data["disease.state"] == state2].select_dtypes(include=[np.number])
        genes = state1_data.columns

        # Per-column two-sample t-test.
        # NOTE(review): p-values are not corrected for multiple testing.
        _, p_values = stats.ttest_ind(state1_data, state2_data, nan_policy='omit')
        # Depending on the SciPy version, nan_policy='omit' can return a masked
        # array; turn masked entries into NaN so everything below is a plain array.
        p_values = np.asarray(np.ma.filled(p_values, np.nan), dtype=float)

        # log2 of the ratio of group means.
        # NOTE(review): assumes positive, linear-scale values; if the data are
        # already log-transformed a difference of means is the log fold change.
        fold_change = np.log2(state1_data.mean() / state2_data.mean())

        # Work on positional arrays: integer-indexing a label-indexed Series is
        # deprecated, and label lookups break on duplicate column names.
        log2_fc = fold_change.to_numpy()
        neg_log10_p = -np.log10(p_values)

        # Identify significant genes (NaN p-values compare False, i.e. not significant)
        is_significant = p_values < alpha
        up_mask = is_significant & (log2_fc > fold_change_threshold)
        down_mask = is_significant & (log2_fc < -fold_change_threshold)
        significant_genes_up = genes[up_mask]
        significant_genes_down = genes[down_mask]

        # Create the volcano plot for the current pair
        fig, ax = plt.subplots(figsize=(12, 10))
        ax.scatter(log2_fc, neg_log10_p, color='grey', alpha=0.5, label='Non-Significant')
        ax.scatter(log2_fc[up_mask], neg_log10_p[up_mask], color="red", label="Upregulated", alpha=0.5)
        ax.scatter(log2_fc[down_mask], neg_log10_p[down_mask], color='blue', label="Downregulated", alpha=0.5)

        # Plot log2 fold change threshold lines
        ax.axvline(x=fold_change_threshold, color='green', linestyle='--', label='Log2 Fold Change Threshold')
        ax.axvline(x=-fold_change_threshold, color='green', linestyle='--')

        ax.axhline(y=-np.log10(alpha), color="red", linestyle="--", label=f"Significance Level (p={alpha})")
        ax.set_xlabel("Log2 fold change")
        ax.set_ylabel("-Log10(p-value)")
        ax.set_title(f"Volcano Plot for Differential Gene Expression: {state1} vs {state2}")

        # Annotate significant genes
        for mask, text_colour in ((up_mask, 'red'), (down_mask, 'blue')):
            for gene, x, y in zip(genes[mask], log2_fc[mask], neg_log10_p[mask]):
                ax.annotate(gene, (x, y), textcoords="offset points", xytext=(0, 5), ha='center', color=text_colour)

        ax.legend()
        ax.grid()
        plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Join the sample metadata to the expression profiles (matched on sample ID)
merged_df = pd.merge(meta_df, dengue_df.T, left_on='sample', right_index=True)

# Features: every numeric column except the sample ID and the label.
# NOTE(review): numeric metadata columns, if any, would be used as features.
X = merged_df.drop(['sample', 'disease.state'], axis=1)
X = X.select_dtypes(include='number')
# Target: 1 for 'Dengue Fever', 0 for every other disease state
y = (merged_df['disease.state'] == 'Dengue Fever').astype(int)

# Split data into training and testing sets; stratify so both sets keep the
# class proportions, which matters for a small, imbalanced cohort.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train a Random Forest classifier; seeded so reruns give the same model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)

# Print the results
print(f'Accuracy: {accuracy}')
print('Classification Report:\n', classification_report_output)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Join the sample metadata to the expression profiles (matched on sample ID)
merged_df = pd.merge(meta_df, dengue_df.T, left_on='sample', right_index=True)

# DF vs. DHF comparison: keep only samples from those two states. Labelling
# every DF/DHF sample as positive over the whole cohort would instead train
# "acute dengue vs. convalescent/healthy", which is a different question.
df_vs_dhf = merged_df[
    merged_df['disease.state'].isin(['Dengue Fever', 'Dengue Hemorrhagic Fever'])
]

# Features: every numeric column except the sample ID and the label.
# NOTE(review): numeric metadata columns, if any, would be used as features.
X = df_vs_dhf.drop(['sample', 'disease.state'], axis=1)
X = X.select_dtypes(include='number')
# Target: 1 for 'Dengue Hemorrhagic Fever', 0 for 'Dengue Fever'
y = (df_vs_dhf['disease.state'] == 'Dengue Hemorrhagic Fever').astype(int)

# Split data into training and testing sets; stratify so both sets keep the
# class proportions, which matters for a small cohort.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train a Random Forest classifier; seeded so reruns give the same model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)

# Print the results
print(f'Accuracy: {accuracy}')
print('Classification Report:\n', classification_report_output)
