In [2]:

#tokenization
import re

text = input("Enter text: ")
text=text.lower()
sentences = re.split(r'[.!?]+', text)
sentences = [s.strip() for s in sentences if s.strip()]
words = re.findall(r'\b\w+\b', text)

print("\nSentences:")
print(sentences)
print("\nWords:")
print(words)

Enter text:  Runners running.



Sentences:
['Runners running']

Words:
['Runners', 'running']


In [3]:

#porter stemmer
def simple_porter_stem(word):
    word = word.lower()
    # Example: remove common prefixes using startswith()
    if word.startswith("re"):
        word = word[2:]
    elif word.startswith("un"):
        word = word[2:]
    # Remove plural endings
    if word.endswith("sses"):
        word = word[:-2]
    elif word.endswith("ies"):
        word = word[:-2]
    elif word.endswith("s") and not word.endswith("ss"):
        word = word[:-1]
    # Remove past/continuous endings
    elif word.endswith("ing"):
        word = word[:-3]
    elif word.endswith("ed"):
        word = word[:-2]
    return word

# Example
words = ["redoing", "unwanted", "caresses", "running", "cats"]
for w in words:
    print(w, "->", simple_porter_stem(w))

redoing -> do
unwanted -> want
caresses -> caress
running -> runn
cats -> cat


In [2]:

#sentence boundary

from sklearn.linear_model import LogisticRegression

m=LogisticRegression().fit([[1],[0]],[1,0])
t="Wow! Is this working? Yes it is.".split()
o=[t[0]]

for i in range(1,len(t)):
    if t[i-1][-1] in ".!?" and m.predict([[t[i][0].isupper()]])[0]:
        o.append(t[i])
    else:
        o[-1]+=" "+t[i]

print(o)

['Wow!', 'Is this working?', 'Yes it is.']


In [3]:

#topic boundary

from sklearn.linear_model import LogisticRegression

m=LogisticRegression().fit([[0],[1]],[0,1])
s=["AI is powerful","Machine learning improves AI","Football is popular","The match was exciting"]

t=1;print("TOPIC",t,":");print(s[0])
prev=1

for i in range(1,len(s)):
    o=len(set(s[i-1].split())&set(s[i].split()))>0
    if prev and not o and m.predict([[1]])[0]:
        t+=1;print("\nTOPIC",t,":")
    print(s[i]);prev=o

TOPIC 1 :
AI is powerful
Machine learning improves AI

TOPIC 2 :
Football is popular
The match was exciting


In [4]:
#phrase structure constituency

import nltk
from nltk import CFG
from nltk.tree import Tree

grammar = CFG.fromstring("""
S -> NP VP
NP -> DT NN
VP -> V NP
DT -> 'the'
NN -> 'boy' | 'ball'
V -> 'hit'
""")

parser = nltk.ChartParser(grammar)

sentence = "the boy hit the ball".split()
for tree in parser.parse(sentence):
    tree.pretty_print()

             S              
      _______|___            
     |           VP         
     |        ___|___        
     NP      |       NP     
  ___|___    |    ___|___    
 DT      NN  V   DT      NN 
 |       |   |   |       |   
the     boy hit the     ball



In [1]:
#dependency tree

import nltk
from nltk.grammar import DependencyGrammar
from nltk.parse import ProjectiveDependencyParser

# Define dependency grammar
grammar = DependencyGrammar.fromstring("""
'hit' -> 'boy' | 'ball'
'boy' -> 'the'
'ball' -> 'the'
""")

# Create parser
parser = ProjectiveDependencyParser(grammar)

# Input sentence
sentence = "the boy hit the ball".split()

# Parse and print tree
for tree in parser.parse(sentence):
    print(tree)
    tree.pretty_print()

(hit (boy the) (ball the))
    hit     
  ___|___    
boy     ball
 |       |   
the     the 



In [5]:
#shift reduce
import nltk
from nltk import CFG
from nltk.parse import ShiftReduceParser

g = CFG.fromstring("""
S -> NP VP
NP -> DT NN
VP -> V NP
DT -> 'the'
NN -> 'boy' | 'ball'
V -> 'hit'
""")

p = ShiftReduceParser(g)
s = "the boy hit the ball".split()

print("STACK\t\tINPUT\t\tACTION")
stk = []
inp = s[:]

for w in s:
    stk.append(w)
    inp.pop(0)
    print(stk, "\t", inp, "\tSHIFT")

print(stk, "\t", inp, "\tREDUCE\n")

for t in p.parse(s):
    t.pretty_print()

STACK		INPUT		ACTION
['the'] 	 ['boy', 'hit', 'the', 'ball'] 	SHIFT
['the', 'boy'] 	 ['hit', 'the', 'ball'] 	SHIFT
['the', 'boy', 'hit'] 	 ['the', 'ball'] 	SHIFT
['the', 'boy', 'hit', 'the'] 	 ['ball'] 	SHIFT
['the', 'boy', 'hit', 'the', 'ball'] 	 [] 	SHIFT
['the', 'boy', 'hit', 'the', 'ball'] 	 [] 	REDUCE

             S              
      _______|___            
     |           VP         
     |        ___|___        
     NP      |       NP     
  ___|___    |    ___|___    
 DT      NN  V   DT      NN 
 |       |   |   |       |   
the     boy hit the     ball

