<a href="https://colab.research.google.com/github/bachvu98/Policy-NLP/blob/master/Preprocessing_Policy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import required dependencies
First, we import the require dependencies

In [1]:
import pandas as pd
import numpy as np
import string
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
segments = pd.read_csv('/content/drive/My Drive/OPP-115/OPP-115/segment_categories.csv',index_col=0)
segments.head()

Unnamed: 0,Policy UID,segment_id,category_name,segments
0,20,0,Other,<strong> Privacy Policy </strong> <br> <br> <s...
1,20,1,Other,This privacy policy does not apply to Sites ma...
2,20,2,Policy Change,"By visiting our Sites, you are accepting the p..."
3,20,3,First Party Collection/Use,<strong> What Information Is Collected? </stro...
4,20,4,First Party Collection/Use,<strong> Personally Identifiable Information <...


In [8]:
categories = list(segments['category_name'].unique())
print(categories)
cols = {'Other': 'other',
        'Policy Change': 'policy_change',
        'First Party Collection/Use': 'first_party_collection_use',
        'Third Party Sharing/Collection': 'third_party_sharing_collection',
        'Do Not Track': 'do_not_track',
        'User Choice/Control': 'user_choice_control',
        'International and Specific Audiences': 'international_specific_audiences',
        'Data Security': 'data_security',
        'Data Retention': 'data_retention',
        'User Access, Edit and Deletion': 'user_access_edit_deletion'}

['Other', 'Policy Change', 'First Party Collection/Use', 'Third Party Sharing/Collection', 'Do Not Track', 'User Choice/Control', 'International and Specific Audiences', 'Data Security', 'Data Retention', 'User Access, Edit and Deletion']


In [9]:
#Loop through the categories and generate a set of new columns with names in cols
binary_categories = pd.DataFrame({'Policy UID':segments['Policy UID'], 'segment_id':segments['segment_id'], 'segments':segments['segments']})

for category in categories:
    one_hot = lambda s: 1 if s.startswith(category) else 0
    binary_categories[cols[category]] = segments['category_name'].apply(one_hot)

In [11]:
binary_categories.head()

Unnamed: 0,Policy UID,segment_id,segments,other,policy_change,first_party_collection_use,third_party_sharing_collection,do_not_track,user_choice_control,international_specific_audiences,data_security,data_retention,user_access_edit_deletion
0,20,0,<strong> Privacy Policy </strong> <br> <br> <s...,1,0,0,0,0,0,0,0,0,0
1,20,1,This privacy policy does not apply to Sites ma...,1,0,0,0,0,0,0,0,0,0
2,20,2,"By visiting our Sites, you are accepting the p...",0,1,0,0,0,0,0,0,0,0
3,20,3,<strong> What Information Is Collected? </stro...,0,0,1,0,0,0,0,0,0,0
4,20,4,<strong> Personally Identifiable Information <...,0,0,1,0,0,0,0,0,0,0


# Preprocessing segments text

In [3]:
def clean_text(text):
  text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", text).split())
  #Then tokenisation
  tokens = word_tokenize(text)
  # convert to lower case
  tokens = [w.lower() for w in tokens]
  # remove punctuation from each word
  table = str.maketrans('', '', string.punctuation)
  stripped = [w.translate(table) for w in tokens]
  # remove remaining tokens that are not alphabetic
  words = [word for word in stripped if word.isalpha()]
  # filter out stop words
  stop_words = set(stopwords.words('english'))
  # You can add more stop words here, specific for texts
  words = [w for w in words if not w in stop_words]
  # stemming of words
  lemmatizer = WordNetLemmatizer()
  words = [lemmatizer.lemmatize(word) for word in words]
  # Convert from list to a sentence again
  text = ' '.join(word for word in words)
  return text

In [None]:
#Process the segments here
segments['segments'] = segments['segments'].apply(clean_text)
segments.head()

Unnamed: 0,Policy UID,segment_id,category_name,segments
0,20,0,Other,strong privacy policy strong br br strong effe...
1,20,1,Other,privacy policy apply site maintained company o...
2,20,2,Policy Change,visiting site accepting practice described pri...
3,20,3,First Party Collection/Use,strong information collected strong br br coll...
4,20,4,First Party Collection/Use,strong personally identifiable information str...


In [None]:
segments.iloc[5]['segments']

'strong nonpersonally identifiable information strong br br interact site may collect certain information identify individually nonpersonally identifiable information nonpii demographic data information computer mobile device device use access site including ip address general location information unique device identifier browser type browser language transactional information may also collect information use site including search term search result additional traffic data time access date access software crash report session identification number access time referring website address server may automatically keep activity log use site addition nonidentifiable individual information may collect aggregate data regarding use site br br'

In [None]:
segments.to_csv("/content/drive/My Drive/OPP-115/OPP-115/segment_processed.csv")

In [12]:
binary_categories['segments'] = binary_categories['segments'].apply(clean_text)
binary_categories.head()

Unnamed: 0,Policy UID,segment_id,segments,other,policy_change,first_party_collection_use,third_party_sharing_collection,do_not_track,user_choice_control,international_specific_audiences,data_security,data_retention,user_access_edit_deletion
0,20,0,strong privacy policy strong br br strong effe...,1,0,0,0,0,0,0,0,0,0
1,20,1,privacy policy apply site maintained company o...,1,0,0,0,0,0,0,0,0,0
2,20,2,visiting site accepting practice described pri...,0,1,0,0,0,0,0,0,0,0
3,20,3,strong information collected strong br br coll...,0,0,1,0,0,0,0,0,0,0
4,20,4,strong personally identifiable information str...,0,0,1,0,0,0,0,0,0,0


In [13]:
binary_categories.to_csv("/content/drive/My Drive/OPP-115/OPP-115/binary_segment_categories.csv")