##Necessary Imports

In [1]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.28-py3-none-any.whl.metadata (2.9 kB)
Collecting requests<3,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.6.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.3.28-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [16]:
import numpy as np
import pandas as pd
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

##Load Dataset

In [4]:
#File paths
mapping_file = '/content/Parameters for Reviews slash Feedback Classification.xlsx'
nps_reviews_file = '/content/NPS reviews on Tab 2023.xlsx'
google_reviews_file = '/content/Google reviews data 2023.xlsx'

#Load DataFrames
mapping_df = pd.read_excel(mapping_file)
nps_df = pd.read_excel(nps_reviews_file)
google_df = pd.read_excel(google_reviews_file)

#Display
print("Mapping File (Categories)")
print(mapping_df.head(), "\n")

print("NPS Reviews")
print(nps_df.head(), "\n")

print("Google Reviews")
print(google_df.head(), "\n")


Mapping File (Categories)
  COMPLAINT TYPE COMPLAINT TYPE.1     MEDIUM RESPONSIBLE
0           Shop    Wrong Article   Facebook  E-Commerce
1         Online  Missing Article  Instagram  Operations
2            NaN  Damaged Article       Call          IT
3            NaN    Fitting Issue      Email   Warehouse
4            NaN    Quality Issue        VOC         NaN 

NPS Reviews
                  Date  Rating  Store Id  \
0  2023-12-31 00:00:00    6.25    4001.0   
1  2023-12-31 00:00:00    5.00    4001.0   
2  2023-12-31 00:00:00    6.00    4001.0   
3  2023-12-31 00:00:00    6.25    4035.0   
4  2023-12-30 00:00:00    5.75    1128.0   

                                  Remarks          Name       Contact  
0                                    fine    Simran Naz  3.046542e+09  
1                                     noo        munaza  3.025506e+09  
2  well done happy new year do sale offer  Fahad sheikh  3.218488e+09  
3                                   good   Ayesha khald  3.217313

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


##Cleaning

In [5]:
#Cleaning Function

def clean_text(text):

    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        text = ' '.join(text.split())
        return text
    return text

#Clean NPS
for col in nps_df.columns:
    if nps_df[col].dtype == 'object':
        nps_df[col] = nps_df[col].apply(clean_text)


#Clean Google Reviews
for col in google_df.columns:
    if google_df[col].dtype == 'object':
        google_df[col] = google_df[col].apply(clean_text)


#Clean Mapping File
for col in mapping_df.columns:
    mapping_df[col] = mapping_df[col].apply(clean_text)

print("Cleaned NPS Reviews")
print(nps_df.head())

print("\nCleaned Google Reviews")
print(google_df.head())

print("\nCleaned Mapping File")
print(mapping_df.head())


Cleaned NPS Reviews
                  Date  Rating  Store Id  \
0  2023-12-31 00:00:00    6.25    4001.0   
1  2023-12-31 00:00:00    5.00    4001.0   
2  2023-12-31 00:00:00    6.00    4001.0   
3  2023-12-31 00:00:00    6.25    4035.0   
4  2023-12-30 00:00:00    5.75    1128.0   

                                  Remarks          Name       Contact  
0                                    fine    simran naz  3.046542e+09  
1                                     noo        munaza  3.025506e+09  
2  well done happy new year do sale offer  fahad sheikh  3.218488e+09  
3                                    good  ayesha khald  3.217313e+09  
4                               excellent        rawish  3.349101e+09  

Cleaned Google Reviews
                  Date  Store Code               City  Rating  \
0  2023-12-30 00:00:00      1041.0  wapda town lahore     5.0   
1  2023-12-29 00:00:00      1145.0      stylo mm alam     4.0   
2  2023-12-29 00:00:00      1201.0         rawalpindi     5.0   

In [6]:
#Test
sample_texts = [
    "   Hello!!! World??   ",
    "This    has   extra   spaces",
    "1234 test *** TEXT/   ",
    "Well Done, Happy New Year!!!",
    "Great   Service!!! #1 :)"
]

for text in sample_texts:
    print(f"Original: '{text}'  -->  Cleaned: '{clean_text(text)}'")


Original: '   Hello!!! World??   '  -->  Cleaned: 'hello world'
Original: 'This    has   extra   spaces'  -->  Cleaned: 'this has extra spaces'
Original: '1234 test *** TEXT/   '  -->  Cleaned: 'test text'
Original: 'Well Done, Happy New Year!!!'  -->  Cleaned: 'well done happy new year'
Original: 'Great   Service!!! #1 :)'  -->  Cleaned: 'great service'


##Inspection

In [7]:
print("Missing values in NPS Reviews:")
print(nps_df.isnull().sum())

print("\nMissing values in Google Reviews:")
print(google_df.isnull().sum())

print("\nMissing values in Mapping File:")
print(mapping_df.isnull().sum())


Missing values in NPS Reviews:
Date         1
Rating       2
Store Id     2
Remarks      3
Name        11
Contact     17
dtype: int64

Missing values in Google Reviews:
Date               1
Store Code         2
City               2
Rating             2
Comments          10
Review Reply     192
Reply Date       192
Customer Name      2
dtype: int64

Missing values in Mapping File:
COMPLAINT TYPE      10
COMPLAINT TYPE.1     0
MEDIUM               0
RESPONSIBLE          8
dtype: int64


##Handling Missing Values

In [8]:
#Mapping
cols_to_fill = ['COMPLAINT TYPE', 'RESPONSIBLE']

for col in cols_to_fill:
    if col in mapping_df.columns:
        mapping_df[col] = mapping_df[col].ffill().fillna('unknown')

print(mapping_df[cols_to_fill].isnull().sum())
print(mapping_df[cols_to_fill].head(10))


COMPLAINT TYPE    0
RESPONSIBLE       0
dtype: int64
  COMPLAINT TYPE RESPONSIBLE
0           shop   ecommerce
1         online  operations
2         online          it
3         online   warehouse
4         online   warehouse
5         online   warehouse
6         online   warehouse
7         online   warehouse
8         online   warehouse
9         online   warehouse


In [9]:
#NPS
fill_values = {
    'Date': nps_df['Date'].ffill(),
    'Rating': 0,
    'Store Id': nps_df['Store Id'].ffill(),
    'Remarks': 'unknown',
    'Name': 'anonymous',
    'Contact': 'unknown'
}


nps_df = nps_df.fillna(value=fill_values)

print("Missing values after filling:")
print(nps_df.isnull().sum())
print(nps_df.head(10))


Missing values after filling:
Date        0
Rating      0
Store Id    0
Remarks     0
Name        0
Contact     0
dtype: int64
                  Date  Rating  Store Id  \
0  2023-12-31 00:00:00   6.250    4001.0   
1  2023-12-31 00:00:00   5.000    4001.0   
2  2023-12-31 00:00:00   6.000    4001.0   
3  2023-12-31 00:00:00   6.250    4035.0   
4  2023-12-30 00:00:00   5.750    1128.0   
5  2023-12-30 00:00:00   6.125    1128.0   
6  2023-12-30 00:00:00   6.125    1128.0   
7  2023-12-30 00:00:00   5.625    1128.0   
8  2023-12-30 00:00:00   5.500    1128.0   
9  2023-12-30 00:00:00   5.375    1128.0   

                                  Remarks              Name         Contact  
0                                    fine        simran naz    3046541646.0  
1                                     noo            munaza    3025505936.0  
2  well done happy new year do sale offer      fahad sheikh    3218488500.0  
3                                    good      ayesha khald    3217313212.0 

In [10]:
#Google Reviews dataset
fill_rules = {
    'Date': 'ffill',
    'Store Code': 'ffill',
    'City': 'ffill',
    'Rating': 'ffill',
    'Comments': 'unknown',
    'Review Reply': 'no reply',
    'Reply Date': 'unknown',
    'Customer Name': 'unknown'
}

for col, method in fill_rules.items():
    if col in google_df.columns:
        if method == 'ffill':
            google_df[col] = google_df[col].ffill()
        else:
            google_df[col] = google_df[col].fillna(method)


print("Missing values after cleaning:")
print(google_df.isnull().sum())

print(google_df.head(10))


Missing values after cleaning:
Date             0
Store Code       0
City             0
Rating           0
Comments         0
Review Reply     0
Reply Date       0
Customer Name    0
dtype: int64
                  Date  Store Code               City  Rating  \
0  2023-12-30 00:00:00      1041.0  wapda town lahore     5.0   
1  2023-12-29 00:00:00      1145.0      stylo mm alam     4.0   
2  2023-12-29 00:00:00      1201.0         rawalpindi     5.0   
3  2023-12-28 00:00:00      1235.0             lahore     3.0   
4  2023-12-28 00:00:00      1022.0            sahiwal     4.0   
5  2023-12-28 00:00:00      4005.0          hyderabad     5.0   
6  2023-12-27 00:00:00      1145.0      stylo mm alam     3.0   
7  2023-12-27 00:00:00      1145.0      stylo mm alam     5.0   
8  2023-12-26 00:00:00      4019.0            karachi     1.0   
9  2023-12-26 00:00:00      1222.0            karachi     1.0   

                                            Comments  \
0                             ve

##Text Splitter

In [11]:
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=30)

documents = []

#Google Reviews
for _, row in google_df.iterrows():
    if row['Comments'] != "Unknown":
        chunks = splitter.split_text(str(row['Comments']))
        for chunk in chunks:
            documents.append({
                "text": chunk,
                "metadata": {
                    "source": "Google Review",
                    "date": str(row['Date']),
                    "store": str(row['Store Code']),
                    "city": row['City'],
                    "rating": row['Rating'],
                    "customer": row['Customer Name']
                }
            })


#NPS Reviews
for _, row in nps_df.iterrows():
    if row['Remarks'] != "Unknown":
        chunks = splitter.split_text(str(row['Remarks']))
        for chunk in chunks:
            documents.append({
                "text": chunk,
                "metadata": {
                    "source": "NPS Review",
                    "date": str(row['Date']),
                    "store": str(row['Store Id']),
                    "rating": row['Rating'],
                    "customer": row['Name']
                }
            })


#Mapping
for _, row in mapping_df.iterrows():
    text_to_embed = " | ".join([str(row[col]) for col in mapping_df.columns])
    chunks = splitter.split_text(text_to_embed)
    for chunk in chunks:
        documents.append({
            "text": chunk,
            "metadata": {
                "source": "Mapping File",
                "category": str(row.get("COMPLAINT TYPE", "unknown")),
                "responsible": str(row.get("RESPONSIBLE", "unknown"))
            }
        })

print(f"Total Chunks Created from all 3 files: {len(documents)}")


Total Chunks Created from all 3 files: 990


##Vector Store

In [12]:
#Embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

texts = [doc['text'] for doc in documents]
metadatas = [doc['metadata'] for doc in documents]

#FAISS vector store
vector_store = FAISS.from_texts(texts, embedding_model, metadatas=metadatas)

print("FAISS vector store created successfully!")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS vector store created successfully!


##Model

In [15]:
# LLM: google/flan-t5-large
text2text_pipeline = pipeline("text2text-generation", model="google/flan-t5-large", max_length=100)
llm = HuggingFacePipeline(pipeline=text2text_pipeline)

# Sentiment Analysis
sentiment_pipeline = pipeline("sentiment-analysis")


Device set to use cpu
  llm = HuggingFacePipeline(pipeline=text2text_pipeline)
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


##Augmentation

In [17]:
prompt_template = """
Classify the following customer feedback into the most relevant two categories from this list:
{categories}

Feedback: "{comment}"

Return only the two categories as a comma-separated list.
"""

category_prompt = PromptTemplate(
    input_variables=["categories", "comment"],
    template=prompt_template
)

category_chain = LLMChain(llm=llm, prompt=category_prompt)

  category_chain = LLMChain(llm=llm, prompt=category_prompt)


##Retrival

In [19]:
vector_store = FAISS.from_texts(texts, embedding_model, metadatas=metadatas)
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

def get_similar_categories(comment: str) -> list[str]:

    results = retriever.get_relevant_documents(comment)
    return [doc.metadata.get("category", "") for doc in results if doc.metadata.get("source") == "Mapping File"]


In [20]:
def classify_feedback(comment: str) -> dict:

#Retrieval
    similar_categories = get_similar_categories(comment)
    categories_list = ", ".join(similar_categories)

#Predict
    llm_output = category_chain.run(categories=categories_list, comment=comment)
    predicted_categories = [cat.strip() for cat in llm_output.split(",")]

#Map categories to departments
    departments = mapping_df[mapping_df["COMPLAINT TYPE"].isin(predicted_categories)]["RESPONSIBLE"].unique().tolist()

#Sentiment Analysis
    sentiment_result = sentiment_pipeline(comment)[0]["label"]


    return {
        "comment": comment,
        "categories": predicted_categories,
        "departments": departments,
        "sentiment": sentiment_result
    }


In [22]:
test = "The staff was very helpful but the checkout process was slow."
result = classify_feedback(test)
print(result)

{'comment': 'The staff was very helpful but the checkout process was slow.', 'categories': ['Customer service'], 'departments': [], 'sentiment': 'NEGATIVE'}


##Flask

In [27]:
pip install flask-cors

Collecting flask-cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Downloading flask_cors-6.0.1-py3-none-any.whl (13 kB)
Installing collected packages: flask-cors
Successfully installed flask-cors-6.0.1


In [None]:
from flask import Flask, request, jsonify
from flask_cors import CORS
import logging
from typing import List, Dict, Any


try:
#Prefer importing the user's existing pipeline if available
    from feedback_pipeline import classify_feedback, get_similar_categories, mapping_df
    _IMPORTED_PIPELINE = True
except Exception:

    classify_feedback = None
    get_similar_categories = None
    mapping_df = None
    _IMPORTED_PIPELINE = False

app = Flask(__name__)
CORS(app)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@app.route("/health", methods=["GET"])
def health_check() -> Any:
    """Health check endpoint.

    Returns basic status information about the service and whether the
    user's processing pipeline was successfully imported.
    """
    return jsonify({
        "status": "ok",
        "pipeline_loaded": _IMPORTED_PIPELINE
    })


@app.route("/classify", methods=["POST"])
def classify_endpoint() -> Any:
    """Classify a single customer comment.

    Expects JSON: { "comment": "..." }
    Returns JSON with the same structure produced by `classify_feedback`.
    """
    if not _IMPORTED_PIPELINE:
        return jsonify({"error": "Processing pipeline not found. Make sure feedback_pipeline.py is available."}), 500

    payload = request.get_json(force=True)
    if not payload or "comment" not in payload:
        return jsonify({"error": "Request JSON must include a 'comment' field."}), 400

    comment = payload["comment"]
    try:
        result = classify_feedback(comment)
        return jsonify(result)
    except Exception as e:
        logger.exception("Error classifying comment")
        return jsonify({"error": str(e)}), 500


@app.route("/classify/bulk", methods=["POST"])
def classify_bulk_endpoint() -> Any:
    """Classify multiple comments at once.

    Expects JSON: { "comments": ["comment1", "comment2", ...] }
    Returns: { "results": [ {comment, categories, departments, sentiment}, ... ] }
    """
    if not _IMPORTED_PIPELINE:
        return jsonify({"error": "Processing pipeline not found. Make sure feedback_pipeline.py is available."}), 500

    payload = request.get_json(force=True)
    comments = payload.get("comments") if payload else None
    if not comments or not isinstance(comments, list):
        return jsonify({"error": "Request JSON must include a 'comments' list."}), 400

    results = []
    for c in comments:
        try:
            results.append(classify_feedback(c))
        except Exception as e:
            logger.exception("Error classifying a comment in bulk")
            results.append({"comment": c, "error": str(e)})

    return jsonify({"results": results})


@app.route("/similar", methods=["POST"])
def similar_endpoint() -> Any:
    """Return top similar categories (from mapping file) for a comment.

    Expects JSON: { "comment": "...", "top_k": 5 }
    Returns: { "similar_categories": [ ... ] }
    """
    if not _IMPORTED_PIPELINE:
        return jsonify({"error": "Processing pipeline not found. Make sure feedback_pipeline.py is available."}), 500

    payload = request.get_json(force=True)
    comment = payload.get("comment") if payload else None
    if not comment:
        return jsonify({"error": "Request JSON must include a 'comment' field."}), 400

    try:
        categories = get_similar_categories(comment)
        return jsonify({"similar_categories": categories})
    except Exception as e:
        logger.exception("Error retrieving similar categories")
        return jsonify({"error": str(e)}), 500


@app.route("/mapping/sample", methods=["GET"])
def mapping_sample() -> Any:
    """Return a small sample of the loaded mapping DataFrame for debugging.

    Useful to verify that your mapping file was successfully loaded by the
    underlying pipeline. If mapping_df is not available it returns an error.
    """
    if mapping_df is None:
        return jsonify({"error": "mapping_df not available. Ensure your pipeline exposes mapping_df."}), 500

#Return first 20 rows as records (JSON serializable)
    sample = mapping_df.head(20).to_dict(orient="records")
    return jsonify({"mapping_sample": sample})


if __name__ == "__main__":
    # Development server only.
    app.run(host="0.0.0.0", port=5000, debug=False)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [None]:
pip install flask-cors