<a href="https://colab.research.google.com/github/awagler2/NODE/blob/main/CIPanalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


In [None]:
# Toy sample: each record pairs a CIP code with its program title.
cip_records = [
    ("14.0901", "Computer Engineering, General"),
    ("14.0702", "Chemical Engineering"),
    ("09.0102", "Mass Communication/Media Studies"),
    ("14.1001", "Electrical and Electronics Engineering"),
    ("09.0701", "Radio and Television Broadcasting"),
]

# Column-oriented view of the same records, in the layout pandas expects.
data = {
    "CIP Code": [code for code, title in cip_records],
    "Description": [title for code, title in cip_records],
}

df = pd.DataFrame(data)


In [None]:
# External labour-market data, one row per CIP code:
# (CIP code, median salary in USD, employment growth rate in %)
labour_rows = [
    ("14.0901", 85000, 12.0),
    ("14.0702", 75000, 10.0),
    ("09.0102", 45000, 8.0),
    ("14.1001", 80000, 9.5),
    ("09.0701", 50000, 7.0),
]

# Transpose the rows into one list per column.
codes, salaries, growth_rates = (list(column) for column in zip(*labour_rows))

external_data = {
    "CIP Code": codes,
    "Median Salary": salaries,
    "Employment Growth (%)": growth_rates,
}

external_df = pd.DataFrame(external_data)


In [None]:
# Attach the salary / employment-growth columns to every CIP row.
# A left join keeps each row of df even when external_df has no match for its code.
df_merged = df.merge(external_df, how="left", on="CIP Code")


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns rescaled to the [0, 1] range; the scaled values replace the
# originals inside df_merged.
columns_to_scale = ['Median Salary', 'Employment Growth (%)']

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_merged[columns_to_scale])
df_merged[columns_to_scale] = scaled_values


In [None]:
# Derive the hierarchy levels from the CIP code string.
# A code looks like "14.0901": the first 2 characters are the broad category,
# the first 5 characters ("14.09") are the 4-digit subfield.
cip_code = df['CIP Code']
for level_column, prefix_length in (('Level_1', 2), ('Level_2', 5)):
    df[level_column] = cip_code.str.slice(stop=prefix_length)

# The full 6-digit code is the most specific level.
df['Level_3'] = cip_code


In [None]:
# One-hot encode the two coarser CIP levels.
#
# Each level gets its own encoder. A fitted OneHotEncoder only remembers the
# column it was last fitted on, so re-fitting a single shared encoder on
# Level_2 and then asking it for the Level_1 feature names raises
# "ValueError: input_features is not equal to feature_names_in_".
level_1_encoder = OneHotEncoder()
level_2_encoder = OneHotEncoder()

# fit_transform returns a SciPy sparse matrix by default; densify it so pandas
# builds one numeric column per category.
level_1_encoded = level_1_encoder.fit_transform(df[['Level_1']]).toarray()
level_2_encoded = level_2_encoder.fit_transform(df[['Level_2']]).toarray()

# Convert encoded arrays to DataFrames for clarity. Reusing df's index keeps
# the rows aligned in the column-wise concat below.
level_1_df = pd.DataFrame(
    level_1_encoded,
    columns=level_1_encoder.get_feature_names_out(['Level_1']),
    index=df.index,
)
level_2_df = pd.DataFrame(
    level_2_encoded,
    columns=level_2_encoder.get_feature_names_out(['Level_2']),
    index=df.index,
)

# Merge encoded features back into the original DataFrame
df_encoded = pd.concat([df, level_1_df, level_2_df], axis=1)


ValueError: input_features is not equal to feature_names_in_

In [None]:
# Turn the free-text descriptions into TF-IDF weights.
# max_features=10 caps the vocabulary to keep the example small.
tfidf = TfidfVectorizer(max_features=10)
description_matrix = tfidf.fit_transform(df['Description'])
text_features = description_matrix.toarray()

# One column per vocabulary term.
text_features_df = pd.DataFrame(
    data=text_features,
    columns=tfidf.get_feature_names_out(),
)

# Append the text features to the one-hot encoded frame, column-wise.
df_final = pd.concat((df_encoded, text_features_df), axis="columns")


NameError: name 'df_encoded' is not defined

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Drop non-numeric and original columns not needed for t-SNE. The t-SNE
# coordinate columns are dropped as well (missing labels are ignored) so that
# re-running this cell does not feed a previous embedding back in as input.
features = df_final.drop(
    columns=[
        'CIP Code', 'Description', 'Level_1', 'Level_2', 'Level_3',
        't-SNE_1', 't-SNE_2',
    ],
    errors='ignore',
)

# scikit-learn requires perplexity < n_samples and rejects its default of 30
# on a frame this small, so cap it at n_samples - 1.
perplexity = min(30.0, len(features) - 1)

# Run t-SNE (fixed random_state for a reproducible layout)
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
tsne_results = tsne.fit_transform(features)

# Add t-SNE results to the DataFrame
df_final['t-SNE_1'] = tsne_results[:, 0]
df_final['t-SNE_2'] = tsne_results[:, 1]

# Plot the results
plt.figure(figsize=(8, 6))
plt.scatter(df_final['t-SNE_1'], df_final['t-SNE_2'], c='blue', s=50)

# Pair each description with its coordinates positionally rather than by index
# label, so the annotation does not depend on df_final having a 0..n-1 index.
for label, x, y in zip(df_final['Description'], tsne_results[:, 0], tsne_results[:, 1]):
    plt.annotate(label, (x, y))

plt.title("t-SNE Visualization of CIP Codes")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.show()


In [None]:
# Build the t-SNE input from the externally sourced features in df_merged.
#
# df_merged is created before the Level_* columns are added to df and before
# any t-SNE columns exist, so those labels may be absent here; errors='ignore'
# drops whichever of them are present instead of raising KeyError. Dropping
# the t-SNE columns also keeps this cell safe to re-run.
# NOTE(review): this uses only the columns carried by df_merged. If the
# one-hot / TF-IDF features were meant to be included as well, join them in
# before this step - confirm intent.
features_with_external = df_merged.drop(
    columns=[
        'CIP Code', 'Description', 'Level_1', 'Level_2', 'Level_3',
        't-SNE_1', 't-SNE_2',
    ],
    errors='ignore',
)

# scikit-learn requires perplexity < n_samples and rejects its default of 30
# on a frame this small, so cap it at n_samples - 1.
perplexity_external = min(30.0, len(features_with_external) - 1)

# Run t-SNE again
tsne = TSNE(n_components=2, perplexity=perplexity_external, random_state=42)
tsne_results_with_external = tsne.fit_transform(features_with_external)

# Add updated t-SNE results to the DataFrame
df_merged['t-SNE_1'] = tsne_results_with_external[:, 0]
df_merged['t-SNE_2'] = tsne_results_with_external[:, 1]


In [None]:
# Plot the results
import matplotlib.pyplot as plt

# The salary and growth columns of df_merged were min-max scaled to [0, 1]
# earlier in the notebook, so multiplying them by a constant does not give
# back dollars or percentages. Look the unscaled figures up in external_df
# instead, aligned row-for-row with df_merged by CIP code (codes missing from
# external_df come back as NaN).
raw_external = (
    external_df.drop_duplicates(subset="CIP Code")
    .set_index("CIP Code")
    .reindex(df_merged["CIP Code"])
)

plt.figure(figsize=(10, 7))
plt.scatter(df_merged['t-SNE_1'], df_merged['t-SNE_2'], c='blue', s=50)

# Annotate with employment and salary info
for description, salary, growth, x, y in zip(
    df_merged['Description'],
    raw_external['Median Salary'],
    raw_external['Employment Growth (%)'],
    df_merged['t-SNE_1'],
    df_merged['t-SNE_2'],
):
    label = f"{description}\nSalary: ${salary:.0f}\nGrowth: {growth:.1f}%"
    plt.annotate(label, (x, y), fontsize=8)

plt.title("t-SNE Visualization with Employment and Salary Data")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.show()
