# Preprocessing Web Service Descriptions

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pandas as pd

# Download necessary NLTK resources.
# word_tokenize() needs Punkt sentence-tokenizer data. Older NLTK releases
# load it from the 'punkt' package, while newer releases (3.8.2 / 3.9 onward)
# load it from 'punkt_tab' and raise LookupError if only 'punkt' is present.
# Fetching both keeps this cell working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')  # English stop-word list used for filtering
nltk.download('wordnet')    # lexical database backing WordNetLemmatizer
nltk.download('omw-1.4')    # Open Multilingual WordNet data, fetched alongside 'wordnet'

# Initialize tools once at module level so preprocess_text() can reuse them
# for every row instead of rebuilding them per call.
stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn a raw description into a space-separated string of normalized tokens.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter ``a-z`` or whitespace, tokenize with ``word_tokenize``,
    drop tokens found in ``stop_words``, Porter-stem each remaining token and
    finally pass the stem through the WordNet lemmatizer.

    Args:
        text: The value to clean. Anything that is not a ``str`` is treated
            as missing.

    Returns:
        The processed tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    if isinstance(text, str):
        # Disallowed characters are removed outright (not replaced by a
        # space), so words joined only by punctuation end up fused.
        letters_only = re.sub(r'[^a-z\s]', '', text.lower())

        normalized = []
        for token in word_tokenize(letters_only):
            if token in stop_words:
                continue
            # Stem first, then lemmatize the resulting stem.
            normalized.append(lemmatizer.lemmatize(stemmer.stem(token)))
        return " ".join(normalized)

    return ""


if __name__ == "__main__":
    # One input file exists per "Top N categories" cut: N = 10, 20, ..., 100.
    for n in range(10, 101, 10):
        input_csv = f"../data/top_web_services_categories_output/Top_{n}_Web_Services_Categories.csv"
        df = pd.read_csv(input_csv)

        # Show the frame as loaded, before the new column is added.
        print(df)

        # Clean every description; the result goes into a new column next to
        # the original text.
        descriptions = df['Service Description']
        df['Pre-Processed Description'] = descriptions.apply(preprocess_text)

        # Write the augmented frame to the current working directory.
        output_csv = f"Pre_Processed_Top_{n}_Web_Services_Categories.csv"
        df.to_csv(output_csv, encoding='utf-8', index=False, header=True)

        # Show the frame again, now including the processed column.
        print(df)

        print(f"Preprocessing done! Processed file saved to: {output_csv}")


                                    Service Description Service Classification
0     Google Maps' services have been split into mul...                Mapping
1     It has been split into multiple APIs, includin...                 Social
2     Its functions have been split among the follow...                 Social
3     What was formerly the ECSeCommerce Servicehas ...              eCommerce
4     Twilio provides a simple hosted API and markup...              Telephony
...                                                 ...                    ...
4697  Use the FindMeOn API to manage your identities...                 Social
4698  The FreedomSpeaks.com site is the first non-pa...             Government
4699  PhoneGnome is a hardware-to-internet service t...              Telephony
4700  Use the Dojo Learning API to access lessons, u...             Enterprise
4701  This API offers delayed stock quotes for equit...              Financial

[4702 rows x 2 columns]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91956\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91956\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91956\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\91956\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


                                    Service Description  \
0     Google Maps' services have been split into mul...   
1     It has been split into multiple APIs, includin...   
2     Its functions have been split among the follow...   
3     What was formerly the ECSeCommerce Servicehas ...   
4     Twilio provides a simple hosted API and markup...   
...                                                 ...   
4697  Use the FindMeOn API to manage your identities...   
4698  The FreedomSpeaks.com site is the first non-pa...   
4699  PhoneGnome is a hardware-to-internet service t...   
4700  Use the Dojo Learning API to access lessons, u...   
4701  This API offers delayed stock quotes for equit...   

     Service Classification                          Pre-Processed Description  
0                   Mapping  googl map servic split multipl api includ stat...  
1                    Social  split multipl api includ twitter ad api twitte...  
2                    Social  function split amon

                                     Service Description  \
0      Google Maps' services have been split into mul...   
1      It has been split into multiple APIs, includin...   
2      The Data API allows users to integrate their p...   
3      The Flickr API can be used to retrieve photos ...   
4      Its functions have been split among the follow...   
...                                                  ...   
10073  Use the Mollom API to combat spam. Mollom is a...   
10074  Use the MailBoxValidator Email Validation Web ...   
10075  The Weewar API allows you to create notifiers,...   
10076  This API offers delayed stock quotes for equit...   
10077  Use the Protein Data Bank Japan (PDBj) API to ...   

      Service Classification  \
0                    Mapping   
1                     Social   
2                      Video   
3                     Photos   
4                     Social   
...                      ...   
10073               Security   
10074                  

                                     Service Description  \
0      Google Maps' services have been split into mul...   
1      It has been split into multiple APIs, includin...   
2      The Data API allows users to integrate their p...   
3      The Flickr API can be used to retrieve photos ...   
4      Its functions have been split among the follow...   
...                                                  ...   
12049  Use the Mollom API to combat spam. Mollom is a...   
12050  Use the MailBoxValidator Email Validation Web ...   
12051  The Weewar API allows you to create notifiers,...   
12052  This API offers delayed stock quotes for equit...   
12053  Use the Protein Data Bank Japan (PDBj) API to ...   

      Service Classification  \
0                    Mapping   
1                     Social   
2                      Video   
3                     Photos   
4                     Social   
...                      ...   
12049               Security   
12050                  

                                     Service Description  \
0      Google Maps' services have been split into mul...   
1      It has been split into multiple APIs, includin...   
2      The Data API allows users to integrate their p...   
3      The Flickr API can be used to retrieve photos ...   
4      Its functions have been split among the follow...   
...                                                  ...   
13085  Use the Mollom API to combat spam. Mollom is a...   
13086  Use the MailBoxValidator Email Validation Web ...   
13087  The Weewar API allows you to create notifiers,...   
13088  This API offers delayed stock quotes for equit...   
13089  Use the Protein Data Bank Japan (PDBj) API to ...   

      Service Classification  \
0                    Mapping   
1                     Social   
2                      Video   
3                     Photos   
4                     Social   
...                      ...   
13085               Security   
13086                  