# NLP + Statistics

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist
import spacy

## NLP

### 1. Write a program for text processing

In [2]:
# Basic NLTK text-processing pipeline:
# lowercase -> strip punctuation -> tokenize -> drop stop words -> count.

# Download required NLTK datasets (only the first time).
# 'punkt_tab' is the tokenizer resource recent NLTK releases load for
# word_tokenize; 'punkt' is kept so older installations keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample text for processing
text = """
Electronics and Communication Engineering (ECE) is a dynamic and rapidly evolving branch of engineering that combines principles from both
electronics and communication technologies. This field plays a pivotal role in the development of modern technologies and contributes significantly
to various sectors, including telecommunications, aerospace, healthcare, and automation.
"""

# Step 1: Convert text to lowercase so counting is case-insensitive
text_lower = text.lower()

# Step 2: Remove punctuation using regex (keeps word characters and whitespace)
text_cleaned = re.sub(r'[^\w\s]', '', text_lower)

# Step 3: Tokenize the text into words
tokens = word_tokenize(text_cleaned)

# Step 4: Remove stop words (common words like "the", "and", etc.)
# A set gives O(1) membership tests inside the comprehension.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]

# Step 5: Frequency Distribution
freq_dist = FreqDist(filtered_tokens)

# Output processed results
print("Original Text:")
print(text)
print("\nProcessed Text:")
print(text_cleaned)
print("\nFiltered Tokens (without stopwords):")
print(filtered_tokens)
print("\nFrequency Distribution of Tokens:")
# print(freq_dist) would only show the one-line summary
# "<FreqDist with N samples and M outcomes>", so list every token with its
# count instead, most frequent first.
for word, count in freq_dist.most_common():
    print(f"{word}: {count}")

Original Text:

Electronics and Communication Engineering (ECE) is a dynamic and rapidly evolving branch of engineering that combines principles from both
electronics and communication technologies. This field plays a pivotal role in the development of modern technologies and contributes significantly
to various sectors, including telecommunications, aerospace, healthcare, and automation.


Processed Text:

electronics and communication engineering ece is a dynamic and rapidly evolving branch of engineering that combines principles from both
electronics and communication technologies this field plays a pivotal role in the development of modern technologies and contributes significantly
to various sectors including telecommunications aerospace healthcare and automation


Filtered Tokens (without stopwords):
['electronics', 'communication', 'engineering', 'ece', 'dynamic', 'rapidly', 'evolving', 'branch', 'engineering', 'combines', 'principles', 'electronics', 'communication', 'technolog

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\addyf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\addyf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 2. Write a program to implement NLP using spaCy

In [None]:
# spaCy demo: run the small English pipeline over a sample paragraph and
# print tokens, POS tags, named entities, dependencies and lemmas.


def _print_section(heading, rows):
    """Print *heading*, then every entry of *rows* on its own line."""
    print(heading)
    for row in rows:
        print(row)


# Load the small English pipeline
nlp = spacy.load("en_core_web_sm")

# Sample text for processing
text = """
Electronics and Communication Engineering (ECE) is a dynamic and rapidly evolving branch of engineering that combines principles from both
electronics and communication technologies. This field plays a pivotal role in the development of modern technologies and contributes significantly
to various sectors, including telecommunications, aerospace, healthcare, and automation.
"""

# Running the pipeline once yields a Doc carrying all annotations used below
doc = nlp(text)

# Step 1: Tokenization (the Doc is already split into tokens)
_print_section("Tokens:", (tok.text for tok in doc))

# Step 2: Part-of-Speech Tagging (POS)
_print_section(
    "\nPart-of-Speech Tagging:",
    (f"{tok.text}: {tok.pos_}" for tok in doc),
)

# Step 3: Named Entity Recognition (NER)
_print_section(
    "\nNamed Entities:",
    (f"{span.text} ({span.label_})" for span in doc.ents),
)

# Step 4: Dependency Parsing (token, its relation label, and its head)
_print_section(
    "\nDependency Parsing:",
    (f"{tok.text} <-{tok.dep_}- {tok.head.text}" for tok in doc),
)

# Step 5: Lemmatization
_print_section(
    "\nLemmatization:",
    (f"{tok.text} -> {tok.lemma_}" for tok in doc),
)

## Statistics

### 1. Differences between descriptive and inferential statistics. Write down at least 10-15 differences

| **Descriptive Statistics**                           | **Inferential Statistics**                             |
|------------------------------------------------------|--------------------------------------------------------|
| Aims to describe or summarize a data set.            | Aims to make predictions or inferences about a population based on a sample. |
| Deals with data that is collected or observed.       | Uses sample data to infer properties of a larger population. |
| Examples: mean, median, mode, variance, standard deviation. | Examples: hypothesis testing, confidence intervals, regression analysis. |
| No generalization is made beyond the data.           | Inferences are made to generalize results beyond the sample. |
| Data is presented in the form of tables, graphs, and charts. | Inferences are drawn using statistical models and tests. |
| Focuses on summarizing the characteristics of the sample. | Focuses on making conclusions that extend to a broader population. |
| Provides exact information about the sample.         | Provides probabilistic information about a population. |
| No conclusions are made about causality.             | Can be used to test hypotheses about relationships, including causal ones when the study design supports it. |
| Examples of measures: central tendency, dispersion.  | Examples of tests: t-tests, chi-square tests, ANOVA. |
| More concerned with the "what" (describing the data).| More concerned with the "why" or "how" (testing theories or hypotheses). |
| Analysis can be simple and does not require assumptions about data distribution. | Often requires assumptions (normal distribution, randomness, etc.) to be valid. |
| Useful for understanding the structure of the data.  | Useful for drawing conclusions from data and making decisions. |
| Primarily concerned with describing data features.   | Primarily concerned with making predictions or inferences. |
| Examples: calculating average sales, total revenue, etc. | Example: estimating the population mean based on a sample, testing the effectiveness of a drug. |
| Often used in exploratory data analysis (EDA).        | Often used in confirmatory data analysis (CDA). |
