In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import RegexpParser
text = """A newspaper is the strongest medium for news. People are reading newspapers for decades. 
It has a huge contribution to globalization. Right now because of easy internet connection, 
people don't read printed newspapers often. They read the online version."""
print("Sample text: \n", text, "\n")
sent_tokenized = sent_tokenize(text)
print("Tokenizing by sentence: \n", sent_tokenized, "\n")
word_tokenized = word_tokenize(text)
print("Tokenizing by word: \n", word_tokenized, "\n")
stop_words = set(stopwords.words('english'))
punctuation_set = set(punctuation)
print("After filtering the stop words and punctuation: ")
filtered_words = [word for word in word_tokenized if word.casefold() not in stop_words and word.casefold() not in punctuation_set]
for word in filtered_words:
    print(word)
ps = PorterStemmer()
words = ["reading", "globalization", "Being", "Went", "gone", "going"]
print("\nGiven words: ", words)
stemm = [ps.stem(i) for i in words]
print("After stemming: ", stemm, "\n")
lem = WordNetLemmatizer()
print("rocks:", lem.lemmatize("rocks"))
print("corpora:", lem.lemmatize("corpora"))
print("better:", lem.lemmatize("better"))
print("believes:", lem.lemmatize("believes"), "\n")
print("went as adjective:", lem.lemmatize("went", pos="a"))
print("went as verb:", lem.lemmatize("went", pos="v"))
print("went as noun:", lem.lemmatize("went", pos="n"), "\n")
postag = nltk.pos_tag(word_tokenized)
print("POS tagging: \n")
for i in postag:
    print(i)
print("\n")
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunker = RegexpParser(grammar)
output = chunker.parse(postag)
print("After Chunking:\n", output)
output.pretty_print()

[nltk_data] Downloading package punkt to /home/cc2/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/cc2/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/cc2/nltk_data...
[nltk_data] Downloading package omw-1.4 to /home/cc2/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/cc2/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/cc2/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /home/cc2/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Sample text: 
 A newspaper is the strongest medium for news. People are reading newspapers for decades. 
It has a huge contribution to globalization. Right now because of easy internet connection, 
people don't read printed newspapers often. They read the online version. 

Tokenizing by sentence: 
 ['A newspaper is the strongest medium for news.', 'People are reading newspapers for decades.', 'It has a huge contribution to globalization.', "Right now because of easy internet connection, \npeople don't read printed newspapers often.", 'They read the online version.'] 

Tokenizing by word: 
 ['A', 'newspaper', 'is', 'the', 'strongest', 'medium', 'for', 'news', '.', 'People', 'are', 'reading', 'newspapers', 'for', 'decades', '.', 'It', 'has', 'a', 'huge', 'contribution', 'to', 'globalization', '.', 'Right', 'now', 'because', 'of', 'easy', 'internet', 'connection', ',', 'people', 'do', "n't", 'read', 'printed', 'newspapers', 'often', '.', 'They', 'read', 'the', 'online', 'version', '.'] 



In [2]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn import metrics
cancer = datasets.load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3, random_state=109)
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("Actual values:", y_test)
print("Predicted values:", y_pred)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))

Actual values: [1 1 0 0 1 0 1 1 1 0 0 0 1 0 1 1 0 0 1 0 1 1 0 0 1 1 0 1 1 1 1 0 1 1 1 1 1
 0 1 1 0 1 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 1 1 0 0 1 0 1 0 0 1 1 1 1 0 1 1 1
 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1
 0 1 1 0 0 0 1 0 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 1 0 1 1 0 0 1 0
 0 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1]
Predicted values: [1 1 0 0 1 0 1 1 1 0 0 0 1 0 0 1 0 0 1 0 1 1 0 0 1 1 0 1 1 1 1 0 1 1 1 1 1
 0 1 1 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 1 1 1 0 0 1 0 1 0 0 1 1 1 1 0 1 1 1
 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 1 0 1 0 1 1
 0 1 1 0 0 0 1 0 1 1 1 1 1 0 1 0 1 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 1 0 0 1 0
 0 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1]
Accuracy: 0.9649122807017544
Precision: 0.9811320754716981
Recall: 0.9629629629629629


In [4]:
# Imports

import requests

import numpy as np

import pandas as pd

from bs4 import BeautifulSoup

import matplotlib.pyplot as plt

import re

import os

%matplotlib inline

riturl="https://purdue.edu"

webpage requests.get(riturl)

ritsoup BeautifulSoup(webpage.content, "html.parser")

print(ritsoup)

print("Title of the parsed page: ",ritsoup.title)

print()

print("All the links: ")

links [link.get('href') for link in ritsoup.find_all('a')]

print(links,"\n")

print("Accessing elements of the parsed page: ")

print("Accessing heading element: ",ritsoup.h1)

print(ritsoup.head)

print(ritsoup.head.meta)

paragraphs ritsoup.find all("p")

print()

print("Get all <p> elements: ".paragraphs)
print()

print("Gets all the p elements with a class attribute with value hide: ",ritsoup find_all("p", attrs ("class": "hide")))

print()

print("Obtaining strings: ",ritsoup.hl.string).

print()

#The contents method is similar but always returns a list:

print("Obtaining strings using contents method: ", ritsoup.hl.contents)

print()

#If the element contains any tags, then string will return None

print(paragraphs[2]."\n")

print(paragraphs[2].string."\n")

#However, contents will return a list as before, mixing different kinds of elements:

para2 paragraphs[2].contents

print(para2,"\n")

para7-paragraphs[5].contents

print(para7,"\n")

#You can also use stripped_strings, which is a generator over all the strings (tags

#removed)

#inside the element; this is a fast way to extract the raw texts, with all tag soup strained

#off:

for s in paragraphs[3].stripped_strings: print("="*50) print(s)

SyntaxError: invalid syntax (3598658853.py, line 21)

In [5]:
# Imports
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import re
import os

%matplotlib inline

# URL to scrape
riturl = "https://purdue.edu"

# Make a GET request to fetch the raw HTML content
webpage = requests.get(riturl)

# Parse the HTML content
ritsoup = BeautifulSoup(webpage.content, "html.parser")

# Print the parsed HTML
print(ritsoup)

# Print the title of the parsed page
print("Title of the parsed page: ", ritsoup.title)

print()
print("All the links: ")

# Extract all links
links = [link.get('href') for link in ritsoup.find_all('a')]
print(links, "\n")

print("Accessing elements of the parsed page: ")

# Access the first heading element
print("Accessing heading element: ", ritsoup.h1)

print(ritsoup.head)
print(ritsoup.head.meta)

# Find all paragraph elements
paragraphs = ritsoup.find_all("p")

print()
print("Get all <p> elements: ", paragraphs)
print()

# Get all <p> elements with a class attribute with value 'hide'
print("Gets all the p elements with a class attribute with value 'hide': ", ritsoup.find_all("p", attrs={"class": "hide"}))
print()

# Obtain strings from a specific element
print("Obtaining strings: ", ritsoup.h1.string if ritsoup.h1 else "No h1 tag found.")
print()

# The contents method is similar but always returns a list
print("Obtaining strings using contents method: ", ritsoup.h1.contents if ritsoup.h1 else "No h1 tag found.")
print()

# If the element contains any tags, then string will return None
if len(paragraphs) > 2:
    print(paragraphs[2], "\n")
    print(paragraphs[2].string, "\n")

    # Contents will return a list as before, mixing different kinds of elements
    para2 = paragraphs[2].contents
    print(para2, "\n")

if len(paragraphs) > 5:
    para7 = paragraphs[5].contents
    print(para7, "\n")

# You can also use stripped_strings, which is a generator over all the strings (tags removed)
# inside the element; this is a fast way to extract the raw texts, with all tag soup strained
if len(paragraphs) > 3:
    print("Stripped strings from paragraph 3:")
    for s in paragraphs[3].stripped_strings:
        print("=" * 50)
        print(s)



<!DOCTYPE html>

<html class="is-fullheight" lang="en-US">
<head>
<title>Purdue University</title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="http://gmpg.org/xfn/11" rel="profile"/>
<style type="text/css">@media (min-width:1024px){
.purdue-home-news-events__list .purdue-home-news-events__title{
min-height:3.5rem;
}
}
.purdue-home-hot-spot {
    overflow: visible !important;
}
.purdue-home-slide__hot-spot-mobile{
    overflow: hidden;
}

/***
.purdue-home-button-list li:last-child .purdue-home-button{
    background: #000;
    color: #fff;
}
.purdue-home-button-list li:last-child .purdue-home-button:after{
    background-image: url("data:image/svg+xml,%3Csvg xmlns=%27http://www.w3.org/2000/svg%27 viewBox=%270 0 448 512%27%3E%3C%21--%21 Font Awesome Pro 6.4.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license %28Commercial License%29 Copyright 2023 Fonticons, Inc. --%3E%3Cpath fill=%27%23cfb9