In [1]:
import re
import requests 
import pandas as pd 
from bs4 import BeautifulSoup

# 1. Document collection

In [2]:
url = 'https://www.squ.edu.om/'

In [3]:
# send request to the given url, and parse the html content
def getDocument(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

In [4]:
soup = getDocument(url)

In [5]:
aTags = soup.find_all('a')
# get all the links form the a tags
rowLinks = [x.get('href') for x in aTags]
# take only the valid links 
links = [x for x in rowLinks if x and 'http' in x and url.split('/')[2] in x]

In [6]:
pages = []
for link in links:
    # get url content
    page = getDocument(link)
    # replace any decimal, non-word character or non-ASCII with space and covert text to lower case
    text = re.sub(r'[^\x00-\x7F]|[\W+\s+\d+]', ' ', page.get_text()).lower()
    pages.append(text)

# 2. Pre-processing

In [8]:
# download nltk stop words 
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\s132962\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\s132962\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [9]:
# tokenize the documents 
from nltk.tokenize import word_tokenize
pages = [word_tokenize(doc) for doc in pages]

In [10]:
# remove stop words 
from nltk.corpus import stopwords
for i in range(len(pages)):
    # ignore stop words and single letters
    pages[i] = [word for word in pages[i] if word not in stopwords.words('english') and len(word) > 1]

In [11]:
# apply stemming to the words 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
for i in range(len(pages)):
    pages[i] = [stemmer.stem(word) for word in pages[i]]

# 3. Feature extraction

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([" ".join(page) for page in pages])
# convert to dataframe
df = pd.DataFrame(vectors.toarray(), columns = vectorizer.get_feature_names())

In [24]:
df

Unnamed: 0,aajabri,abd,abdalla,abdul,abdulla,abdullah,abil,abl,aboutabout,abouth,...,zahran,zaidi,zainab,zealandnicaraguanigernigerianiuenorfolk,zeheimi,zone,zonesoman,zoom,zubair,zuhd
0,0.0,0.0,0.04332,0.0,0.0,0.0,0.000000,0.0,0.0,0.014182,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.016261,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.021396,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.015714,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.026240,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,0.0,0.0,0.00000,0.0,0.0,0.0,0.012586,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.015578,0.015578,0.0,0.0,0.0
144,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.039787,0.039787,0.0,0.0,0.0
145,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.026129,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
146,0.0,0.0,0.00000,0.0,0.0,0.0,0.000000,0.0,0.0,0.025608,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0


# 4. Data Transformation

In [20]:
from sklearn.random_projection import GaussianRandomProjection
# set the value of p = 500 
projection = GaussianRandomProjection(n_components = 500)
transformed_matrix = projection.fit_transform(vectors)

In [22]:
x = pd.DataFrame(transformed_matrix)

In [23]:
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,-0.006491,0.014505,0.054343,0.092321,-0.007050,0.039258,-0.094057,-0.052908,0.036354,-0.012692,...,-0.041652,-0.120089,0.048923,-0.021094,-0.023775,-0.040113,0.082810,-0.014531,0.061826,0.014517
1,-0.015512,0.028451,0.030429,0.081047,0.005039,-0.084460,-0.019111,0.023934,0.010818,0.007970,...,-0.045025,0.021898,0.025648,-0.021596,0.037842,-0.069637,0.035346,0.002124,-0.008893,-0.038962
2,-0.039423,-0.010163,0.032008,0.138844,0.043416,-0.042662,-0.108613,0.015542,0.036823,0.046707,...,0.031854,-0.063788,0.003966,-0.024903,0.034528,-0.039833,0.085860,-0.052924,0.094560,-0.004308
3,0.014474,0.000665,-0.014526,0.035388,-0.007189,-0.020941,-0.037024,-0.025609,0.054584,0.017250,...,-0.007028,-0.034149,0.083360,-0.019945,0.078348,-0.012097,0.036479,0.004085,0.071766,-0.026501
4,-0.007477,0.025589,0.032353,0.078680,0.037474,-0.051228,-0.060247,0.009833,0.001520,0.022455,...,0.000575,-0.083518,0.037871,-0.040564,0.060877,-0.061432,0.082207,-0.016455,0.056324,0.000700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,0.034880,0.025958,-0.034920,0.043661,-0.012738,0.035582,-0.016499,-0.043026,0.039354,-0.006846,...,0.007747,-0.028772,0.000403,0.007883,0.026581,0.033437,-0.002989,-0.046948,-0.028549,-0.002830
144,0.033345,0.052596,0.008962,-0.009933,-0.036591,0.029937,0.029324,-0.049513,-0.015953,0.022116,...,0.048380,-0.003114,0.011110,-0.060700,0.005222,-0.030280,0.010055,0.023498,0.038691,0.004142
145,0.006476,0.032686,0.075464,0.056867,0.007781,-0.031711,-0.055315,-0.013085,0.022588,-0.034313,...,-0.010214,-0.101305,0.027063,-0.034109,0.045665,-0.061537,0.093917,0.001296,0.052053,-0.009207
146,0.009872,0.039241,0.079018,0.060845,0.011257,-0.047672,-0.049993,-0.027801,0.042623,-0.052937,...,-0.010281,-0.099315,0.032771,-0.038476,0.039488,-0.075308,0.091601,-0.018896,0.073924,-0.005246


# 5. Data Clustering