In [7]:
import pandas as pd # pandas is used for data manipulation
import numpy as np
from sklearn.model_selection  import train_test_split # using sklearn library to build an train the ML model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk #nltkis for text Preprocessing
from nltk.corpus import stopwords
import re
nltk.download('stopwords')

# work on NLP = word2vec instead!

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
df=pd.read_excel('data.xlsx')
df.tail(10)

Unnamed: 0,Feedback,sentiment
153,The work environment is completely toxic.,negative
154,Toxic managers make every day unbearable,negative
155,The toxic culture here is destroying morale,negative
156,I left because the toxic behavior was never ad...,negative
157,"This team is toxic, full of favoritism and gossip",negative
158,Toxic leadership drives people to quit constantly,negative
159,The toxic atmosphere affects both work and hea...,negative
160,Toxic coworkers make collaboration impossible,negative
161,The company ignores how toxic the workplace ha...,negative
162,"It’s a toxic cycle of overwork, stress, and no...",negative


In [9]:
df.shape

(163, 2)

In [10]:
df=df.dropna()

In [11]:
df.shape

(163, 2)

In [12]:
print(df.isna().sum())

Feedback     0
sentiment    0
dtype: int64


In [13]:
print("Duplicates :", df.duplicated().sum())
df=df.drop_duplicates()

Duplicates : 0


In [14]:
df.shape

(163, 2)

In [15]:
print(df['sentiment'].value_counts())

sentiment
negative    95
positive    68
Name: count, dtype: int64


In [16]:
#remove stop words:a , an , the , at , by...
stop_words = set(stopwords.words('english')) #" this is the variable that stores stopwords"
#clean the text more

def clean_text(text):
  # Convert non-string values to empty strings
  if not isinstance(text, str):
      text = ''
  text=re.sub('[^a-zA-Z]',' ',text) # remove punctuation/numbers
  text=text.lower() # transform the text into lower cases
  text=text.split()
  text=[word for word in text if not word in stop_words]
  text=' '.join(text)
  return text
df["Cleaned_Feedback"]=df["Feedback"].apply(clean_text)
df.head(23)

Unnamed: 0,Feedback,sentiment,Cleaned_Feedback
0,love,positive,love
1,I love it here,positive,love
2,I love the work here,positive,love work
3,"good about this company, provides virtual roles",positive,good company provides virtual roles
4,Less visibility for next level and promotions ...,negative,less visibility next level promotions point ti...
5,People from IIMs and ISBs only have the opport...,positive,people iims isbs opportunity grow people local...
6,"Good pay, 6 pagers, focus on prioritisation an...",positive,good pay pagers focus prioritisation working b...
7,Management layer has lot of redundancies.,negative,management layer lot redundancies
8,Logistics supply management application operat...,negative,logistics supply management application operat...
9,Toxic culture (but that will depend on your te...,negative,toxic culture depend team hierarchical questio...


**Preprocessing the text**

In [17]:
df.shape

(163, 3)

**Convert Text to vectors:**

In [18]:
vectorizer = TfidfVectorizer(max_features=500) # The goal is to extract features from
#The words; so that each word will have  its own represntative vector in a specific space
# Check for and remove rows with missing values in the 'sentiment' column
df.dropna(subset=['sentiment'], inplace=True)

x=vectorizer.fit_transform(df["Cleaned_Feedback"]).toarray()
y=df["sentiment"]# Labels of the data
print(y)

0      positive
1      positive
2      positive
3      positive
4      negative
         ...   
158    negative
159    negative
160    negative
161    negative
162    negative
Name: sentiment, Length: 163, dtype: object


**Splitting the data for training and testing:**

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=42, stratify=y, shuffle=True)

In [20]:
print(y_train.value_counts())
print(y_test.value_counts())

sentiment
negative    66
positive    48
Name: count, dtype: int64
sentiment
negative    29
positive    20
Name: count, dtype: int64


In [21]:
y_test.head()

Unnamed: 0,sentiment
5,positive
160,negative
11,positive
85,negative
116,negative


In [22]:
y_train.head()

Unnamed: 0,sentiment
151,negative
115,negative
40,negative
138,negative
129,negative


**Train the ML model (logistic regression for classification) as the output of
 the model will be 0/1 negative feedback or positive feedback / if I want to improve the model into building sth that genertaes more than just a binary outcome we can use the ANN**

In [23]:
model= LogisticRegression()
model.fit(x_train, y_train) #" training the model on the labeled data set "

**Model evaluation**

In [24]:
y_pred= model.predict(x_test)
print("classification report: \n", classification_report(y_test, y_pred))
print("confusion matrix: \n", confusion_matrix(y_test, y_pred)) #" comparing the actual labelsof the testing set and the predictedlabels by the model "
print("accuracy score: \n", accuracy_score(y_test, y_pred))

classification report: 
               precision    recall  f1-score   support

    negative       0.63      1.00      0.77        29
    positive       1.00      0.15      0.26        20

    accuracy                           0.65        49
   macro avg       0.82      0.57      0.52        49
weighted avg       0.78      0.65      0.56        49

confusion matrix: 
 [[29  0]
 [17  3]]
accuracy score: 
 0.6530612244897959


In [25]:
#Using the built model to make new predictions
new_feedback=[
    "I hate working here ",
    "I love this environment",
    "Great environment",
]
new_feedback_cleaned=[clean_text(text) for text in new_feedback]
new_feedback_vectorized=vectorizer.transform(new_feedback_cleaned).toarray()
predictions=model.predict(new_feedback_vectorized)
print(predictions)

['negative' 'positive' 'negative']


In [26]:
from sklearn.metrics import roc_auc_score
y_pred_proba = model.predict_proba(x_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print("AUC Score:", auc_score)

AUC Score: 0.8568965517241379


**Developping the App!**

In [27]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m112.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.50.0


In [28]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.4.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.4.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.4.0


In [29]:
!pip install joblib



**Saving the model and vectorizer**

In [30]:
import joblib
joblib.dump(model, "sentiment_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [31]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import re
import nltk
from nltk.corpus import stopwords
import plotly.express as px # Import plotly.express
import seaborn as sns
from wordcloud import WordCloud
# Download stopwords
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def clean_text(text):
  # Convert non-string values to empty strings
  if not isinstance(text, str):
      text = ''
  text=re.sub('[^a-zA-Z]',' ',text) # remove punctuation/numbers
  text=text.lower() # transform the text into lower cases
  text=text.split()
  text=[word for word in text if not word in stop_words]
  text=' '.join(text)
  return text


# Load your trained model and vectorizer
model = joblib.load("sentiment_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Set a session state variable to control which page is displayed
if 'page' not in st.session_state:
    st.session_state.page = 'Home Page'

#-------------------HOME PAGE VIEW--------------------------------------------------------
if st.session_state.page == 'Home Page':
    st.image('logo.png', caption="L'Art Rue Logo")
    st.markdown("<h1 style='text-align: center; color:#F466E6'> Monitoring & Evaluation Data Tools </h1>", unsafe_allow_html=True)

    col1, col2, col3 =st.columns([1,2,1])
    with col2:
      if st.button("Tool 01 :  Feedback Sentiment Analysis"):
          st.session_state.page = 'Sentiment Analysis'
      elif st.button("Tool 02 :  Funnel Charts Generation"):
          st.session_state.page = 'Funnel Chart Generation'
      elif st.button("Tool 03 :  Migration Pattern Analysis"):
          st.session_state.page = 'Migration Pattern Analysis'
      elif st.button("Tool 04 : Word Cloud Generator"):
          st.session_state.page = 'Word Cloud Generator'

#--------- ONCE THE BOTTOM OF SENTIMENT ANAYSIS IS CLICKED!!!!!!-------------------------------
elif st.session_state.page == 'Sentiment Analysis':
    st.image('logo.png', caption="L'Art Rue Logo")
    st.markdown("<h1 style='text-align: center; color:#ff7923'> Monitoring & Evaluation Tool 01</h1>", unsafe_allow_html=True)
    st.markdown("<h2 style='text-align: center; color: #F466E6'>Feedback Sentiment Dashboard</h2>", unsafe_allow_html=True)
    st.markdown("Please upload an Excel file with a column named 'Feedback' to perform sentiment analysis.")

    uploaded_file = st.file_uploader("(Please ensure the text is translated in English )", type=["xlsx"])

    if uploaded_file:
        df = pd.read_excel(uploaded_file)

        if "Feedback" not in df.columns:
            st.error("excel must have a column named 'Feedback'")
        else:
            if st.button('Perform Sentiment Analysis'):
                df["Feedback_cleaned"]= df["Feedback"].apply(clean_text)
                X = vectorizer.transform(df["Feedback_cleaned"])
                predictions = model.predict(X)
                df["sentiment"] = predictions

                # Calculate positive/negative %
                counts = df["sentiment"].value_counts(normalize=True) * 100

                # Define colors for the pie chart
                colors = ['#ff7923', '#F72CC3'] # You can change these colors

                # Create columns for side-by-side layout
                col1, col2 = st.columns(2)

                with col1:
                   st.subheader("Sentiment Distribution")
                   # Display pie chart
                   fig, ax = plt.subplots(figsize=(4, 5)) # You can adjust the figure size here
                   counts.plot.pie(autopct='%1.1f%%', ax=ax, colors=colors)
                   ax.legend(labels=counts.index, loc="best") # Add legend
                   st.pyplot(fig)

                with col2:
                  st.subheader("Predicted Feedback Sentiments")
                  # Optional: show table with predictions
                  st.dataframe(df.drop(columns=["Feedback_cleaned"]))

    if st.button("Return to Home Page"):
        st.session_state.page = 'Home Page'
#---------------------------------ONCE FUNNEL CHART BUTTOM IS CLICKED!!!!!!!-----------------------------------------------------
elif st.session_state.page == 'Funnel Chart Generation':
    st.image('logo.png', caption="L'Art Rue Logo")
    st.markdown("<h1 style= 'text-align:center ; color:#ff7923'> Monitoring & Evaluation Tool 02 </h1>", unsafe_allow_html=True)
    st.markdown("<h2 style= 'text-align:center ; color:#F466E6'> Funnel Charts Generation </h2>", unsafe_allow_html=True)
    st.markdown("Please upload an Excel file with the dymographic data of your candidates (Region of Origin & Residence)")

    uploaded_file = st.file_uploader("(Please ensure that the excel file uploaded respects the form agreed upon!  )", type=["xlsx"])
    if uploaded_file:
        df = pd.read_excel(uploaded_file)
        if "Region of Origin" not in df.columns or "Region of Residence" not in df.columns:
            st.error("Excel must have columns named 'Region of Origin' and 'Region of Residence'")
        else:
          #region of residence funnel chart
          grand_tunis_regions=["Grand Tunis", 'grand tunis']# here i can detail it with all regions in grand tunis
          df['region of residence category']=df['Region of Residence'].apply(lambda x: 'Grand Tunis' if x in grand_tunis_regions else 'Outside Grand Tunis')
          total_by_region=df.groupby('region of residence category').size().reset_index(name='Total Applicants')
          preselected_counts=df[df['Preselected?']=='Yes'].groupby('region of residence category').size().reset_index(name='Preselected')
          selected_counts=df[df['Selected?']=='Yes'].groupby('region of residence category').size().reset_index(name='Selected')
          merged_counts=total_by_region.merge(preselected_counts, on='region of residence category', how='outer').merge(selected_counts, on='region of residence category', how='outer')
          merged_counts=merged_counts.fillna(0)
          funnel_data_stacked=merged_counts.melt(id_vars=['region of residence category'], var_name='Stage', value_name='Count')
          fig01=px.funnel(funnel_data_stacked, x='Count', y='Stage', color='region of residence category', title='Funnel Chart For Candidates distribution through stages/ Region of Residence')
          st.plotly_chart(fig01)
          #region of origin funnel chart
          df['region of origin category']=df['Region of Origin'].apply(lambda x: 'Grand Tunis' if x in grand_tunis_regions else 'Outside Grand Tunis')
          total_by_region=df.groupby('region of origin category').size().reset_index(name='Total Applicants')
          preselected_counts=df[df['Preselected?']=='Yes'].groupby('region of origin category').size().reset_index(name='Preselected')
          selected_counts=df[df['Selected?']=='Yes'].groupby('region of origin category').size().reset_index(name='Selected')
          merged_counts=total_by_region.merge(preselected_counts, on='region of origin category', how='outer').merge(selected_counts, on='region of origin category', how='outer')
          merged_counts=merged_counts.fillna(0)
          funnel_data_stacked=merged_counts.melt(id_vars=['region of origin category'], var_name='Stage', value_name='Count')
          fig02=px.funnel(funnel_data_stacked, x='Count', y='Stage', color='region of origin category', title='Funnel Chart For Candidates distribution through stages/ Region of Origin')
          st.plotly_chart(fig02)

    if st.button("Return to Home Page"):
        st.session_state.page = 'Home Page'
#Migration pattern analysis--------------Heat Map Section-------------------------------------------
elif st.session_state.page == 'Migration Pattern Analysis':
    st.image('logo.png', caption="L'Art Rue Logo")
    st.markdown("<h1 style= 'text-align:center ; color:#ff7923'> Monitoring & Evaluation Tool 03 </h1>", unsafe_allow_html=True)
    st.markdown("<h2 style= 'text-align:center ; color:#F466E6'> Migration Pattern Analysis </h2>", unsafe_allow_html=True)
    st.markdown("Please upload an Excel file with the dymographic data of your candidates (Region of Origin & Residence)")
    uploaded_file = st.file_uploader("(Please ensure that the excel file uploaded respects the form agreed upon!  )", type=["xlsx"])
    if uploaded_file:
        df = pd.read_excel(uploaded_file)
        if "Region of Origin" not in df.columns or "Region of Residence" not in df.columns:
            st.error("Excel must have columns named 'Region of Origin' and 'Region of Residence'")
        else:
          df["region of origin category"]=df["Region of Origin"].apply(lambda x: "Grand Tunis" if x in ["Grand Tunis", 'grand tunis'] else('Not specified' if x=='Not specified' else 'Other than Grand Tunis'))
          df["region of residence category"]=df["Region of Residence"].apply(lambda x: "Grand Tunis" if x in ["Grand Tunis", 'grand tunis'] else('Not specified' if x=='Not specified' else 'Other than Grand Tunis'))
          #heat map number 01
          st.markdown("<h3 style='text-align:center'> Heat map with the 'Not specified' values </h3>",unsafe_allow_html=True)
          focused_migration_patterns=df.groupby(['region of origin category', 'region of residence category']).size().reset_index(name='Count')
          pivot_table_A=focused_migration_patterns.pivot_table(index='region of origin category', columns='region of residence category', values='Count', fill_value=0)
          fig,ax=plt.subplots(figsize=(10,6))
          sns.heatmap(pivot_table_A, annot=True, cmap='Oranges', fmt='g', ax=ax)
          st.pyplot(fig)
          #heat map number 02
          filtered_focused_migration_patterns=focused_migration_patterns[(focused_migration_patterns['region of origin category']!='Not specified') & (focused_migration_patterns['region of residence category']!='Not specified')]
          pivot_table_B=filtered_focused_migration_patterns.pivot_table(index='region of origin category', columns='region of residence category', values='Count', fill_value=0)
          st.markdown("<h3 style='text-align:center'> Heat map without the 'Not specified' values </h3>",unsafe_allow_html=True)
          fig,ax=plt.subplots(figsize=(10,6))
          sns.heatmap(pivot_table_B, annot=True, cmap='PuRd', fmt='g', ax=ax)
          st.pyplot(fig)
    if st.button("Return to Home Page"):
        st.session_state.page = 'Home Page'
#Word Cloud Genertator-------------------------------------------------Word Cloud Section------------------------------------------
elif st.session_state.page == 'Word Cloud Generator':
    st.image('logo.png', caption="L'Art Rue Logo")
    st.markdown("<h1 style= 'text-align:center ; color:#ff7923'> Monitoring & Evaluation Tool 04 </h1>", unsafe_allow_html=True)
    st.markdown("<h2 style= 'text-align:center ; color:#F466E6'> Word Cloud Generator </h2>", unsafe_allow_html=True)
    st.markdown("Please upload an Excel file with a column of the words you want to turn into a word cloud (please title it 'Words')")
    uploaded_file = st.file_uploader("(Please ensure the words are  translated in English )", type=["xlsx"])
    if uploaded_file:
        df = pd.read_excel(uploaded_file)
        if "Words" not in df.columns:
            st.error("Excel must have a column named 'Words'")
        else:
          filtered_data=df[df['Words'].isna()]
          filtered_data=df[df['Words'] !='Not specified']
          word_counts= filtered_data['Words'].value_counts().to_dict()
          wordcloud = WordCloud(width=800, height=400, background_color='white',colormap='Oranges').generate_from_frequencies(word_counts)
          fig=plt.figure(figsize=(12,12))
          plt.imshow(wordcloud, interpolation='bilinear')
          plt.axis('off')
          st.pyplot(fig)
    if st.button("Return to Home Page"):
        st.session_state.page = 'Home Page'



Writing app.py


In [32]:
from google.colab import userdata
import os

# Load the ngrok authtoken from Colab secrets and set it as an environment variable
os.environ["NGROK_AUTH_TOKEN"] = userdata.get("NGROK_AUTH_TOKEN")

In [34]:
from pyngrok import ngrok
import os
from google.colab import userdata

# Kill previous tunnels
ngrok.kill()

# Set the ngrok authtoken
NGROK_AUTH_TOKEN = userdata.get("NGROK_AUTH_TOKEN")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Start Streamlit in background
get_ipython().system_raw('streamlit run app.py &')

# Expose public URL
url = ngrok.connect(addr='8501')
print(f'Tunnel URL: {url}')

Tunnel URL: NgrokTunnel: "https://770a5cdfe23b.ngrok-free.app" -> "http://localhost:8501"
